From ca577883ae1db4c675030a50909729e0dd69846c Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Tue, 9 Jun 2026 11:30:45 -0700
Subject: [PATCH] add bmm detect and vdpphps in util/cpuid

Bug: None
Change-Id: I9954f96a74e653e3ecd3fbeba533299fa8e57d95
---
 Android.mk                         |    4 -
 BUILD.gn                           |    7 +
 README.chromium                    |    2 +-
 docs/environment_variables.md      |    1 +
 docs/getting_started.md            |    1 +
 include/libyuv/convert.h           |   39 -
 include/libyuv/convert_from_argb.h |   13 -
 include/libyuv/cpu_id.h            |    1 +
 include/libyuv/row.h               |  660 +++----
 include/libyuv/row_sve.h           |   12 +-
 include/libyuv/version.h           |    2 +-
 libyuv.gyp                         |   12 +
 libyuv.gypi                        |    2 -
 source/compare.cc                  |   12 +-
 source/compare_neon64.cc           |    2 +-
 source/compare_win.cc              |   20 +-
 source/convert.cc                  | 2890 ++++++++++++++--------------
 source/convert_argb.cc             |  507 ++---
 source/convert_from.cc             |   91 +-
 source/convert_from_argb.cc        | 2570 ++++++++++++++++++-------
 source/convert_to_argb.cc          |  221 +--
 source/convert_to_i420.cc          |  123 +-
 source/cpu_id.cc                   |    7 +-
 source/planar_functions.cc         |  666 +++----
 source/rotate.cc                   |  121 +-
 source/rotate_argb.cc              |    7 +-
 source/rotate_common.cc            |    8 +-
 source/rotate_neon.cc              |   20 +-
 source/rotate_neon64.cc            |   20 +-
 source/rotate_win.cc               |   12 +-
 source/row_any.cc                  |  193 +-
 source/row_common.cc               |  680 ++-----
 source/row_gcc.cc                  | 1949 +++++++++----------
 source/row_lasx.cc                 |   36 +-
 source/row_lsx.cc                  |   36 +-
 source/row_neon.cc                 |  379 ++--
 source/row_neon64.cc               |  845 +++++---
 source/row_rvv.cc                  |   32 +-
 source/row_sme.cc                  |    7 +-
 source/row_sve.cc                  |    7 +-
 source/row_win.cc                  |  664 +------
 source/scale.cc                    |  303 +--
 source/scale_argb.cc               |  369 +++-
 source/scale_common.cc             |   26 +-
 source/scale_gcc.cc                |   62 +-
 source/scale_rgb.cc                |    4 +-
 source/scale_uv.cc                 |  132 +-
 source/scale_win.cc                |   20 +-
 unit_test/basictypes_test.cc       |   32 +-
 unit_test/color_test.cc            |  241 +--
 unit_test/compare_test.cc          |  102 +-
 unit_test/convert_argb_test.cc     |  543 +++---
 unit_test/convert_test.cc          |  534 +++--
 unit_test/cpu_test.cc              |   30 +-
 unit_test/cpu_thread_test.cc       |    6 +-
 unit_test/math_test.cc             |   72 +-
 unit_test/planar_test.cc           | 1028 +++++-----
 unit_test/rotate_argb_test.cc      |   20 +-
 unit_test/rotate_test.cc           |   40 +-
 unit_test/scale_argb_test.cc       |  100 +-
 unit_test/scale_plane_test.cc      |  237 +--
 unit_test/scale_rgb_test.cc        |   24 +-
 unit_test/scale_test.cc            |   56 +-
 unit_test/scale_uv_test.cc         |   24 +-
 unit_test/unit_test.cc             |    3 +
 unit_test/unit_test.h              |    5 +-
 unit_test/video_common_test.cc     |  136 +-
 util/cpuid.c                       |   35 +
 util/ssim.cc                       |   34 +-
 69 files changed, 8440 insertions(+), 8659 deletions(-)

diff --git a/Android.mk b/Android.mk
index a5fb72f63..c83bdb7ff 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,7 +1,4 @@
 # This is the Android makefile for libyuv for NDK.
-
-# Ignore this file during non-NDK builds.
-ifdef NDK_ROOT
 LOCAL_PATH:= $(call my-dir)
 
 include $(CLEAR_VARS)
@@ -107,4 +104,3 @@ LOCAL_SRC_FILES := \
 
 LOCAL_MODULE := libyuv_unittest
 include $(BUILD_NATIVE_TEST)
-endif  # NDK_ROOT
diff --git a/BUILD.gn b/BUILD.gn
index 2288e24a5..0c0749998 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -22,6 +22,13 @@ declare_args() {
 
 config("libyuv_config") {
   include_dirs = [ "include" ]
+  if (is_android) {
+    if (target_cpu == "arm" || target_cpu == "x86") {
+      ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+    } else {
+      ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+    }
+  }
 
   # Define CHROMIUM to tell cpu_id to avoid sandbox unsafe system calls.
   defines = [ "CHROMIUM" ]
diff --git a/README.chromium b/README.chromium
index cc424502a..92d44bc8c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1948
+Version: 1937
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
diff --git a/docs/environment_variables.md b/docs/environment_variables.md
index 3905d65cc..02c04e61b 100644
--- a/docs/environment_variables.md
+++ b/docs/environment_variables.md
@@ -33,6 +33,7 @@ By default the cpu is detected and the most advanced form of SIMD is used.  But
     LIBYUV_DISABLE_AVXVNNI
     LIBYUV_DISABLE_AVXVNNIINT8
     LIBYUV_DISABLE_AMXINT8
+    LIBYUV_DISABLE_AVX512BMM
 
 ## Arm CPUs
 
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 06160bb20..6f5593576 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -72,6 +72,7 @@ Additional commonly used compiler options can be passed to Bazel via `--copt`:
 
     bazel build -c opt --config=android_arm64 \
         --copt=-DLIBYUV_UNLIMITED_DATA \
+        --copt=-DLIBYUV_BIT_EXACT=1 \
         --copt=-DENABLE_ROW_TESTS \
         //:libyuv_test
 
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 4c4f8f1f9..662337750 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -888,45 +888,6 @@ int ABGRToI420(const uint8_t* src_abgr,
                int width,
                int height);
 
-// BGRA little endian (argb in memory) to I422.
-LIBYUV_API
-int BGRAToI422(const uint8_t* src_bgra,
-               int src_stride_bgra,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
-// ABGR little endian (rgba in memory) to I422.
-LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
-               int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
-// RGBA little endian (abgr in memory) to I422.
-LIBYUV_API
-int RGBAToI422(const uint8_t* src_rgba,
-               int src_stride_rgba,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
 int RGBAToI420(const uint8_t* src_rgba,
diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h
index d9fac50c9..e8a8d6a4d 100644
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -245,19 +245,6 @@ int ARGBToI422(const uint8_t* src_argb,
                int width,
                int height);
 
-// Convert ABGR To I422.
-LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
-               int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
 // RGB to I444 with matrix. See ArgbConstants at the top of this file for usage.
 LIBYUV_API
 int ARGBToI422Matrix(const uint8_t* src_argb,
diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h
index 61a934ce2..c6983fb32 100644
--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -60,6 +60,7 @@ static const int kCpuHasAVX10_2 = 0x2000000;
 static const int kCpuHasAVXVNNI = 0x4000000;
 static const int kCpuHasAVXVNNIINT8 = 0x8000000;
 static const int kCpuHasAMXINT8 = 0x10000000;
+static const int kCpuHasAVX512BMM = 0x20000000;
 
 // These flags are only valid on LOONGARCH processors.
 static const int kCpuHasLOONGARCH = 0x20;
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 835342acd..3072d8ff9 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -23,11 +23,10 @@ extern "C" {
 #endif
 
 // This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) &&                                 \
-    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
-     defined(_M_X86))
-#if ((defined(_MSC_VER) && !defined(__clang__)) || \
-     defined(LIBYUV_ENABLE_ROWWIN))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__) || \
+     defined(_M_X64) || defined(_M_X86))
+#if ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN))
 #define USE_ROW_WIN
 #else
 #define USE_ROW_GCC
@@ -37,33 +36,51 @@ extern "C" {
 // The following are available on clang x86 platforms:
 #if defined(USE_ROW_GCC)
 // Conversions:
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
 #define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_ARGBSETROW_X86
 #define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_SSE2
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
+#define HAS_H422TOARGBROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
 #define HAS_I422TORGB24ROW_SSSE3
-#define HAS_I422TORGBBAROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
 #define HAS_I422TOUYVYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I444TOARGBROW_SSSE3
 #define HAS_I444TORGB24ROW_SSSE3
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORSPLITUVROW_SSSE3
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV12TORGB24ROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
 #define HAS_NV21TORGB24ROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RAWTORGB24ROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
 #define HAS_SETROW_ERMS
 #define HAS_SETROW_X86
 #define HAS_SPLITUVROW_SSE2
 #define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
 #define HAS_YUY2TOARGBROW_SSSE3
 #define HAS_YUY2TOUV422ROW_SSE2
@@ -122,25 +139,14 @@ extern "C" {
 
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
-#if !defined(LIBYUV_DISABLE_X86) &&                                 \
-    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
-     defined(_M_X86))
-#define HAS_ARGBMIRRORROW_AVX2
-#define HAS_RGB24MIRRORROW_AVX2
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__) || \
+     defined(_M_X64) || defined(_M_X86))
 #define HAS_ARGBTOUVMATRIXROW_AVX2
-#define HAS_RGBTOUVMATRIXROW_AVX2
-#define HAS_RGB565TOUVMATRIXROW_AVX2
-#define HAS_ARGB1555TOUVMATRIXROW_AVX2
-#define HAS_ARGB4444TOUVMATRIXROW_AVX2
 #define HAS_MERGEUVROW_AVX2
-#define HAS_MIRRORROW_AVX2
-#define HAS_MIRRORSPLITUVROW_AVX2
-#define HAS_MIRRORUVROW_AVX2
-#define HAS_INTERPOLATEROW_16_AVX2
-#define HAS_INTERPOLATEROW_AVX2
 #endif
 
-#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) &&  \
+#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
      defined(GCC_HAS_AVX2))
 #define HAS_ARGBCOPYALPHAROW_AVX2
@@ -151,21 +157,28 @@ extern "C" {
 #define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBTORGB565DITHERROW_AVX2
 #define HAS_COPYROW_AVX
+#define HAS_H422TOARGBROW_AVX2
 #define HAS_HALFFLOATROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
 #define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
 #define HAS_I422TORGBAROW_AVX2
 #define HAS_I444TOARGBROW_AVX2
 #define HAS_I444TORGB24ROW_AVX2
-#define HAS_J400TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
 #define HAS_MIRRORROW_AVX2
-#define HAS_MIRRORSPLITUVROW_AVX2
 #define HAS_NV12TOARGBROW_AVX2
 #define HAS_NV12TORGB24ROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
 #define HAS_NV21TOARGBROW_AVX2
 #define HAS_NV21TORGB24ROW_AVX2
 #define HAS_SPLITUVROW_AVX2
 #define HAS_UYVYTOARGBROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
 #define HAS_UYVYTOYROW_AVX2
 #define HAS_YUY2TOARGBROW_AVX2
 #define HAS_YUY2TOUV422ROW_AVX2
@@ -184,7 +197,7 @@ extern "C" {
 // The following are available for gcc/clang x86 platforms:
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
-    (defined(__x86_64__) || defined(__i386__)) &&           \
+    (defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_ENABLE_ROWWIN)
 #define HAS_AB64TOARGBROW_SSSE3
 #define HAS_ABGRTOAR30ROW_SSSE3
@@ -222,11 +235,8 @@ extern "C" {
 #define HAS_P410TOAR30ROW_SSSE3
 #define HAS_P410TOARGBROW_SSSE3
 #define HAS_RAWTOARGBROW_AVX2
-#define HAS_RGB24TOARGBROW_AVX2
-#define HAS_RGB565TOARGBROW_AVX2
-#define HAS_ARGB1555TOARGBROW_AVX2
-#define HAS_ARGB4444TOARGBROW_AVX2
 #define HAS_RAWTORGBAROW_SSSE3
+#define HAS_RGB24MIRRORROW_SSSE3
 #define HAS_RGBATOYJROW_SSSE3
 #define HAS_SPLITARGBROW_SSE2
 #define HAS_SPLITARGBROW_SSSE3
@@ -241,11 +251,16 @@ extern "C" {
 #define HAS_ARGBTOYROW_SSSE3
 #define HAS_ARGBTOYMATRIXROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
 
 // TODO: adjust row_win to use 8 bit negative coefficients.
 #define HAS_ABGRTOUVJROW_SSSE3
 #define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3
 #define HAS_ARGBTOUVMATRIXROW_SSSE3
 #define HAS_ARGBTOUV444MATRIXROW_SSSE3
 
@@ -260,8 +275,8 @@ extern "C" {
 // The following are available for AVX2 gcc/clang x86 platforms:
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
-    (defined(__x86_64__) || defined(__i386__)) &&           \
-    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) &&   \
+    (defined(__x86_64__) || defined(__i386__)) &&         \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) && \
     !defined(LIBYUV_ENABLE_ROWWIN)
 #define HAS_AB64TOARGBROW_AVX2
 #define HAS_ABGRTOAR30ROW_AVX2
@@ -281,18 +296,10 @@ extern "C" {
 #define HAS_ARGBTOUVJROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
 #define HAS_ARGBTOUVMATRIXROW_AVX2
-#define HAS_RGBTOUVMATRIXROW_AVX2
-#define HAS_RGB565TOUVMATRIXROW_AVX2
-#define HAS_ARGB1555TOUVMATRIXROW_AVX2
-#define HAS_ARGB4444TOUVMATRIXROW_AVX2
 #define HAS_ARGBTOUV444MATRIXROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOYMATRIXROW_AVX2
-#define HAS_RGBTOYMATRIXROW_AVX2
-#define HAS_RGB565TOYMATRIXROW_AVX2
-#define HAS_ARGB1555TOYMATRIXROW_AVX2
-#define HAS_ARGB4444TOYMATRIXROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
@@ -343,34 +350,24 @@ extern "C" {
 #endif
 
 // This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_WIN) &&         \
-    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
-     defined(_M_X86)) &&                                            \
-    ((defined(_MSC_VER) && !defined(__clang__)) ||                  \
+#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_WIN) && \
+    (defined(__x86_64__) || defined(__i386__) || \
+     defined(_M_X64) || defined(_M_X86)) && \
+    ((defined(_MSC_VER) && !defined(__clang__)) || \
      defined(LIBYUV_ENABLE_ROWWIN))
 #define HAS_RAWTOARGBROW_AVX2
-#define HAS_RGB24TOARGBROW_AVX2
-#define HAS_RGB565TOARGBROW_AVX2
-#define HAS_ARGB1555TOARGBROW_AVX2
-#define HAS_ARGB4444TOARGBROW_AVX2
-#define HAS_ARGBSHUFFLEROW_AVX2
 #if defined(__x86_64__) || defined(_M_X64)
 #define HAS_RAWTOARGBROW_AVX512BW
 #define HAS_RGB24TOARGBROW_AVX512BW
-#define HAS_ARGBSHUFFLEROW_AVX512BW
 #endif
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOYMATRIXROW_AVX2
-#define HAS_RGBTOYMATRIXROW_AVX2
-#define HAS_RGB565TOYMATRIXROW_AVX2
-#define HAS_ARGB1555TOYMATRIXROW_AVX2
-#define HAS_ARGB4444TOYMATRIXROW_AVX2
-#define HAS_ARGBTOUV444MATRIXROW_AVX2
 #define HAS_ABGRTOYROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ABGRTOYJROW_AVX2
 #define HAS_RGBATOYJROW_AVX2
-#define HAS_J400TOARGBROW_AVX2
+#define HAS_RGBATOYROW_AVX2
+#define HAS_BGRATOYROW_AVX2
 #endif
 
 // The following are available for AVX512 clang x86 platforms:
@@ -386,6 +383,7 @@ extern "C" {
 #endif
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_CONVERT16TO8ROW_AVX512BW
+#define HAS_MERGEUVROW_AVX512BW
 #endif
 
 // The following are available for AVX512 clang x64 platforms:
@@ -393,23 +391,14 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
     (defined(CLANG_HAS_AVX512))
 #define HAS_I422TOARGBROW_AVX512BW
-#define HAS_ARGBSHUFFLEROW_AVX512BW
 #define HAS_ARGBTOUV444ROW_AVX512BW
 #define HAS_ARGBTOUV444MATRIXROW_AVX512BW
 #define HAS_ARGBTOYROW_AVX512BW
 #define HAS_ARGBTOYMATRIXROW_AVX512BW
-#define HAS_I422TORGB24ROW_AVX512VBMI
-#define HAS_I422TORGB24ROW_AVX512BW
 #define HAS_ARGBTOUVJ444ROW_AVX512BW
 #define HAS_ARGBTOUVROW_AVX512BW
 #define HAS_ARGBTOUVJROW_AVX512BW
 #define HAS_ARGBTOUVMATRIXROW_AVX512BW
-#define HAS_J400TOARGBROW_AVX512BW
-#define HAS_MERGEUVROW_AVX512BW
-#define HAS_MIRRORROW_AVX512BW
-#define HAS_MIRRORSPLITUVROW_AVX512BW
-#define HAS_SPLITUVROW_AVX512BW
-#define HAS_RGBTOUVMATRIXROW_AVX512BW
 #endif
 
 // The following are available on Neon platforms:
@@ -445,21 +434,14 @@ extern "C" {
 #define HAS_ARGBTOUVJROW_NEON
 #if !defined(__GNUC__) || defined(__clang__)
 #define HAS_ARGBTOUVMATRIXROW_NEON
-#define HAS_RGBTOUVMATRIXROW_NEON
-#define HAS_RGB565TOUVMATRIXROW_NEON
-#define HAS_ARGB1555TOUVMATRIXROW_NEON
-#define HAS_ARGB4444TOUVMATRIXROW_NEON
 #endif
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #if !defined(__aarch64__)
 #define HAS_ARGBTOYMATRIXROW_NEON
-#define HAS_RGB565TOYMATRIXROW_NEON
-#define HAS_ARGB1555TOYMATRIXROW_NEON
-#define HAS_ARGB4444TOYMATRIXROW_NEON
-#define HAS_RGBTOYMATRIXROW_NEON
 #endif
 #define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOUVROW_NEON
 #define HAS_AYUVTOVUROW_NEON
 #define HAS_AYUVTOYROW_NEON
 #define HAS_BGRATOUVROW_NEON
@@ -533,6 +515,7 @@ extern "C" {
 #define HAS_SWAPUVROW_NEON
 #define HAS_UNPACKMT2T_NEON
 #define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
 #define HAS_YUY2TOARGBROW_NEON
@@ -599,7 +582,6 @@ extern "C" {
 #define HAS_ARGBTOUVJ444ROW_NEON_I8MM
 #define HAS_ARGBTOUVJROW_NEON_I8MM
 #define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
-#define HAS_RGBTOUVMATRIXROW_NEON
 #define HAS_ARGBTOUVROW_NEON_I8MM
 #define HAS_BGRATOUVROW_NEON_I8MM
 #define HAS_RGBATOUVROW_NEON_I8MM
@@ -1050,13 +1032,10 @@ struct ArgbConstants {
 
 #endif
 
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
 
 #define align_buffer_64(var, size)                                         \
-  size_t var##_mem_size = (size); /* NOLINT */                             \
-  void* var##_mem = (var##_mem_size > SIZE_MAX - 63)                       \
-                        ? NULL                                             \
-                        : malloc(var##_mem_size + 63);        /* NOLINT */ \
+  void* var##_mem = malloc((size) + 63);                      /* NOLINT */ \
   uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */
 
 #define free_aligned_buffer_64(var) \
@@ -1106,17 +1085,26 @@ struct ArgbConstants {
 #define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
 
 #else /* Visual C */
-#define IACA_UD_BYTES {__asm _emit 0x0F __asm _emit 0x0B}
+#define IACA_UD_BYTES \
+  { __asm _emit 0x0F __asm _emit 0x0B }
 
 #define IACA_SSC_MARK(x) \
-  {__asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90}
+  { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
 
 #define IACA_VC64_START __writegsbyte(111, 111);
 #define IACA_VC64_END __writegsbyte(222, 222);
 #endif
 
-#define IACA_START {IACA_UD_BYTES IACA_SSC_MARK(111)}
-#define IACA_END {IACA_SSC_MARK(222) IACA_UD_BYTES}
+#define IACA_START     \
+  {                    \
+    IACA_UD_BYTES      \
+    IACA_SSC_MARK(111) \
+  }
+#define IACA_END       \
+  {                    \
+    IACA_SSC_MARK(222) \
+    IACA_UD_BYTES      \
+  }
 
 void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
                              const uint16_t* src_u,
@@ -1828,9 +1816,9 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                                int width,
                                const struct ArgbConstants* c);
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                           uint8_t* dst_y,
-                           int width,
-                           const struct ArgbConstants* c);
+                            uint8_t* dst_y,
+                            int width,
+                            const struct ArgbConstants* c);
 void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb,
                                    uint8_t* dst_u,
                                    uint8_t* dst_v,
@@ -2180,76 +2168,12 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
                          uint8_t* dst_v,
                          int width,
                          const struct ArgbConstants* c);
-void RGBToYMatrixRow_C(const uint8_t* src_rgb,
-                       uint8_t* dst_y,
-                       int width,
-                       const struct ArgbConstants* c);
-void RGBToUVMatrixRow_C(const uint8_t* src_rgb,
-                        int src_stride_rgb,
-                        uint8_t* dst_u,
-                        uint8_t* dst_v,
-                        int width,
-                        const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
-                             int src_stride_argb1555,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width,
-                             const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
-                             int src_stride_argb4444,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width,
-                             const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
-                           int src_stride_rgb565,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c);
 void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
                              int src_stride_argb,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width,
                              const struct ArgbConstants* c);
-void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
-                           int src_stride_rgb,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c);
-void RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb,
-                               int src_stride_rgb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c);
-void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb,
-                               int src_stride_rgb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c);
-void RGBToUVMatrixRow_Any_AVX512BW(const uint8_t* src_rgb,
-                                   int src_stride_rgb,
-                                   uint8_t* dst_u,
-                                   uint8_t* dst_v,
-                                   int width,
-                                   const struct ArgbConstants* c);
 void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                             int src_stride_argb,
                             uint8_t* dst_u,
@@ -2271,135 +2195,6 @@ void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
                             uint8_t* dst_y,
                             int width,
                             const struct ArgbConstants* c);
-void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c);
-void RGBToYMatrixRow_Any_AVX2(const uint8_t* src_rgb,
-                              uint8_t* dst_y,
-                              int width,
-                              const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565,
-                             uint8_t* dst_y,
-                             int width,
-                             const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_Any_AVX2(const uint8_t* src_rgb565,
-                                 uint8_t* dst_y,
-                                 int width,
-                                 const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_Any_AVX2(const uint8_t* src_argb1555,
-                                   uint8_t* dst_y,
-                                   int width,
-                                   const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_Any_AVX2(const uint8_t* src_argb1555,
-                                    int src_stride_argb1555,
-                                    uint8_t* dst_u,
-                                    uint8_t* dst_v,
-                                    int width,
-                                    const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_Any_AVX2(const uint8_t* src_argb4444,
-                                   uint8_t* dst_y,
-                                   int width,
-                                   const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_Any_AVX2(const uint8_t* src_argb4444,
-                                    int src_stride_argb4444,
-                                    uint8_t* dst_u,
-                                    uint8_t* dst_v,
-                                    int width,
-                                    const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565,
-                              int src_stride_rgb565,
-                              uint8_t* dst_u,
-                              uint8_t* dst_v,
-                              int width,
-                              const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb565,
-                                  int src_stride_rgb565,
-                                  uint8_t* dst_u,
-                                  uint8_t* dst_v,
-                                  int width,
-                                  const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565,
-                             uint8_t* dst_y,
-                             int width,
-                             const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565,
-                              int src_stride_rgb565,
-                              uint8_t* dst_u,
-                              uint8_t* dst_v,
-                              int width,
-                              const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565,
-                                 uint8_t* dst_y,
-                                 int width,
-                                 const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555,
-                                   uint8_t* dst_y,
-                                   int width,
-                                   const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555,
-                                    int src_stride_argb1555,
-                                    uint8_t* dst_u,
-                                    uint8_t* dst_v,
-                                    int width,
-                                    const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444,
-                                   uint8_t* dst_y,
-                                   int width,
-                                   const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444,
-                                    int src_stride_argb4444,
-                                    uint8_t* dst_u,
-                                    uint8_t* dst_v,
-                                    int width,
-                                    const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565,
-                                  int src_stride_rgb565,
-                                  uint8_t* dst_u,
-                                  uint8_t* dst_v,
-                                  int width,
-                                  const struct ArgbConstants* c);
-
 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_y,
                            int width,
@@ -2426,23 +2221,6 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                           int width,
                           const struct ArgbConstants* c);
 
-void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb,
-                           int src_stride_rgb,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c);
-void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb,
-                              uint8_t* dst_y,
-                              int width,
-                              const struct ArgbConstants* c);
-void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb,
-                               int src_stride_rgb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c);
-
 void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
                                    uint8_t* dst_y,
                                    int width,
@@ -2473,6 +2251,7 @@ void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb,
                                int width,
                                const struct ArgbConstants* c);
 
+
 void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
@@ -2530,29 +2309,15 @@ void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
 void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
 void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
 void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
-void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr,
-                             uint8_t* dst_ptr,
-                             int width);
-void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr,
-                              uint8_t* dst_ptr,
-                              int width);
-void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr,
-                             uint8_t* dst_ptr,
-                             int width);
-void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr,
-                              uint8_t* dst_ptr,
-                              int width);
+void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RGBAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr,
-                             uint8_t* dst_ptr,
-                             int width);
-void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr,
-                              uint8_t* dst_ptr,
-                              int width);
+void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr,
-                             uint8_t* dst_ptr,
-                             int width);
+void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -3152,16 +2917,12 @@ void ARGBToUVJ444Row_C(const uint8_t* src_argb,
                        uint8_t* dst_v,
                        int width);
 
-void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
-void MirrorRow_Any_AVX512BW(const uint8_t* src_ptr,
-                            uint8_t* dst_ptr,
-                            int width);
 void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -3175,18 +2936,15 @@ void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
 void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 
-void MirrorSplitUVRow_AVX512BW(const uint8_t* src,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width);
-void MirrorSplitUVRow_AVX2(const uint8_t* src,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width);
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
@@ -3222,16 +2980,16 @@ void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
 
-void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_rgb24,
-                         int width);
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_rgb24,
+                          int width);
 void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
                          int width);
 void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width);
-void RGB24MirrorRow_Any_AVX2(const uint8_t* src_ptr,
-                             uint8_t* dst_ptr,
-                             int width);
+void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
 void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
@@ -3244,10 +3002,6 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
-void SplitUVRow_AVX512BW(const uint8_t* src_uv,
-                         uint8_t* dst_u,
-                         uint8_t* dst_v,
-                         int width);
 void SplitUVRow_AVX2(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
@@ -3264,10 +3018,6 @@ void SplitUVRow_RVV(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
-void SplitUVRow_Any_AVX512BW(const uint8_t* src_ptr,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width);
 void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
@@ -4242,10 +3992,6 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width);
-void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
-                             uint8_t* dst_argb,
-                             const uint8_t* shuffler,
-                             int width);
 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
@@ -4266,10 +4012,6 @@ void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              const uint8_t* param,
                              int width);
-void ARGBShuffleRow_Any_AVX512BW(const uint8_t* src_ptr,
-                                 uint8_t* dst_ptr,
-                                 const uint8_t* param,
-                                 int width);
 void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              const uint8_t* param,
@@ -4288,15 +4030,12 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                           int width);
 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_argb,
-                         int width);
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw,
-                           uint8_t* dst_argb,
-                           int width);
+void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
-
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
                           uint8_t* dst_argb,
                           int width);
@@ -4383,18 +4122,9 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
 void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGB24ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
-                             uint8_t* dst_ptr,
-                             int width);
-void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
-                               uint8_t* dst_ptr,
-                               int width);
-void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24,
-                             uint8_t* dst_argb,
-                             int width);
-void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
-                                 uint8_t* dst_ptr,
-                                 int width);
+void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
@@ -4402,6 +4132,15 @@ void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
 
+void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
 void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int width);
@@ -4460,7 +4199,9 @@ void ARGB4444ToARGBRow_Any_LASX(const uint8_t* src_ptr,
 
 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
-
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 
@@ -4473,7 +4214,10 @@ void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
                              uint8_t* dst_rgb,
                              uint32_t dither4,
                              int width);
-
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                uint32_t dither4,
+                                int width);
 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                 uint8_t* dst,
                                 uint32_t dither4,
@@ -4635,15 +4379,15 @@ void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
 
-void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void J400ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
-                                uint8_t* dst_ptr,
-                                int width);
+void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
@@ -4858,12 +4602,6 @@ void I444ToARGBRow_AVX2(const uint8_t* y_buf,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I444ToARGBRow_AVX512BW(const uint8_t* y_buf,
-                            const uint8_t* u_buf,
-                            const uint8_t* v_buf,
-                            uint8_t* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
 void I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
                           const uint8_t* u_buf,
                           const uint8_t* v_buf,
@@ -5043,7 +4781,11 @@ void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
                           uint8_t* dst_rgb24,
                           const struct YuvConstants* yuvconstants,
                           int width);
-
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb24,
@@ -5062,7 +4804,11 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width);
-
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
 void NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                          const uint8_t* vu_buf,
                          uint8_t* dst_argb,
@@ -5137,7 +4883,42 @@ void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                          uint8_t* dst_rgba,
                          const struct YuvConstants* yuvconstants,
                          int width);
-
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
 void I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                           const uint8_t* u_buf,
                           const uint8_t* v_buf,
@@ -5150,18 +4931,6 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y,
-                               const uint8_t* src_u,
-                               const uint8_t* src_v,
-                               uint8_t* dst_rgb24,
-                               const struct YuvConstants* yuvconstants,
-                               int width);
-void I422ToRGB24Row_AVX512BW(const uint8_t* src_y,
-                             const uint8_t* src_u,
-                             const uint8_t* src_v,
-                             uint8_t* dst_rgb24,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
 void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
                             const uint8_t* u_buf,
                             const uint8_t* v_buf,
@@ -5198,12 +4967,6 @@ void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I444ToARGBRow_Any_AVX512BW(const uint8_t* y_buf,
-                                const uint8_t* u_buf,
-                                const uint8_t* v_buf,
-                                uint8_t* dst_ptr,
-                                const struct YuvConstants* yuvconstants,
-                                int width);
 void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
@@ -5404,7 +5167,16 @@ void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
                              int width);
-
+void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* uv_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
 void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
@@ -5467,7 +5239,42 @@ void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
                              uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-
+void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
 void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
@@ -5480,18 +5287,6 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
                              uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToRGB24Row_Any_AVX512VBMI(const uint8_t* y_buf,
-                                   const uint8_t* u_buf,
-                                   const uint8_t* v_buf,
-                                   uint8_t* dst_ptr,
-                                   const struct YuvConstants* yuvconstants,
-                                   int width);
-void I422ToRGB24Row_Any_AVX512BW(const uint8_t* y_buf,
-                                 const uint8_t* u_buf,
-                                 const uint8_t* v_buf,
-                                 uint8_t* dst_ptr,
-                                 const struct YuvConstants* yuvconstants,
-                                 int width);
 
 void I400ToARGBRow_C(const uint8_t* src_y,
                      uint8_t* rgb_buf,
@@ -5746,7 +5541,15 @@ void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
 void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
-
+void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
+void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
+                                int width);
 void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
@@ -5760,7 +5563,10 @@ void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
 void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
                                    uint8_t* dst_ptr,
                                    int width);
-
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
 void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr,
                                     uint8_t* dst_ptr,
                                     const uint32_t param,
@@ -6819,6 +6625,11 @@ void InterpolateRow_C(uint8_t* dst_ptr,
                       ptrdiff_t src_stride,
                       int width,
                       int source_y_fraction);
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction);
 void InterpolateRow_AVX2(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
@@ -6849,6 +6660,11 @@ void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
                              ptrdiff_t src_stride_ptr,
                              int width,
                              int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              ptrdiff_t src_stride_ptr,
+                              int width,
+                              int source_y_fraction);
 void InterpolateRow_Any_AVX2(uint8_t* dst_ptr,
                              const uint8_t* src_ptr,
                              ptrdiff_t src_stride_ptr,
@@ -6865,16 +6681,6 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
                          ptrdiff_t src_stride,
                          int width,
                          int source_y_fraction);
-void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
-                            const uint16_t* src_ptr,
-                            ptrdiff_t src_stride,
-                            int width,
-                            int source_y_fraction);
-void InterpolateRow_16_Any_AVX2(uint16_t* dst_ptr,
-                                const uint16_t* src_ptr,
-                                ptrdiff_t src_stride,
-                                int width,
-                                int source_y_fraction);
 void InterpolateRow_16_NEON(uint16_t* dst_ptr,
                             const uint16_t* src_ptr,
                             ptrdiff_t src_stride,
diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h
index 280d635b9..f7e2123a7 100644
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@@ -631,8 +631,8 @@ static inline void I422ToRGB565Row_SVE_SC(
       // Calculate a predicate for the final iteration to deal with the tail.
       "cnth     %[vl]                                   \n"
       "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X
-          RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+          RGB8TORGB565_SVE_FROM_TOP_2X
       // Need to permute the data on the final iteration such that the
       // predicates (.b) line up with the 16-bit element data.
       "trn1     z20.b, z18.b, z19.b                     \n"
@@ -694,8 +694,8 @@ static inline void I422ToARGB1555Row_SVE_SC(
       // Calculate a predicate for the final iteration to deal with the tail.
       "cnth     %[vl]                                   \n"
       "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X
-          RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+          RGB8TOARGB1555_SVE_FROM_TOP_2X
       "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"
 
       "99:                                              \n"
@@ -753,8 +753,8 @@ static inline void I422ToARGB4444Row_SVE_SC(
       // Calculate a predicate for the final iteration to deal with the tail.
       "cnth     %[vl]                                   \n"
       "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X
-          RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+          RGB8TOARGB4444_SVE_FROM_TOP_2X
       "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"
 
       "99:                                              \n"
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 9f9d18da7..f384c1efb 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1948
+#define LIBYUV_VERSION 1937
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/libyuv.gyp b/libyuv.gyp
index fa4b146a4..394840216 100644
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -122,6 +122,18 @@
           'include',
           '.',
         ],
+        'conditions': [
+          ['OS == "android" and target_arch == "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker64',
+            ],
+          }],
+          ['OS == "android" and target_arch != "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker',
+            ],
+          }],
+        ], #conditions
       },
       'sources': [
         '<@(libyuv_sources)',
diff --git a/libyuv.gypi b/libyuv.gypi
index 5cf173ef3..44b127410 100644
--- a/libyuv.gypi
+++ b/libyuv.gypi
@@ -69,7 +69,6 @@
       'source/row_lsx.cc',
       'source/row_neon.cc',
       'source/row_neon64.cc',
-      'source/row_rvv.cc',
       'source/row_win.cc',
       'source/scale.cc',
       'source/scale_any.cc',
@@ -80,7 +79,6 @@
       'source/scale_neon.cc',
       'source/scale_neon64.cc',
       'source/scale_rgb.cc',
-      'source/scale_rvv.cc',
       'source/scale_uv.cc',
       'source/scale_win.cc',
       'source/video_common.cc',
diff --git a/source/compare.cc b/source/compare.cc
index 10023301c..e85cc6d07 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -11,7 +11,6 @@
 #include "libyuv/compare.h"
 
 #include <float.h>
-#include <limits.h>
 #include <math.h>
 #ifdef _OPENMP
 #include <omp.h>
@@ -107,11 +106,8 @@ uint32_t ARGBDetect(const uint8_t* argb,
   uint32_t fourcc = 0;
   int h;
 
-  if (!argb || width <= 0 || height <= 0) {
-    return fourcc;
-  }
   // Coalesce rows.
-  if (stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (stride_argb == width * 4) {
     width *= height;
     height = 1;
     stride_argb = 0;
@@ -249,12 +245,8 @@ uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
                                     int height) {
   uint64_t sse = 0;
   int h;
-  if (!src_a || !src_b || width <= 0 || height <= 0) {
-    return sse;
-  }
   // Coalesce rows.
-  if (stride_a == width && stride_b == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (stride_a == width && stride_b == width) {
     width *= height;
     height = 1;
     stride_a = stride_b = 0;
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 36c5e575c..756f83cb3 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -116,7 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
   uint32_t hash = seed;
   const uint32_t c16 = 0x92d9e201;  // 33^16
   uint32_t tmp, tmp2;
-  asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
+      asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
       "ld1         {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
 
       // count is always a multiple of 16.
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 59374cd8a..9d5bb27cd 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -41,9 +41,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
   return diff;
 }
 
-__declspec(naked) uint32_t SumSquareError_SSE2(const uint8_t* src_a,
-                                               const uint8_t* src_b,
-                                               int count) {
+__declspec(naked) uint32_t
+    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]  // src_a
     mov        edx, [esp + 8]  // src_b
@@ -82,9 +81,8 @@ __declspec(naked) uint32_t SumSquareError_SSE2(const uint8_t* src_a,
 #ifdef HAS_SUMSQUAREERROR_AVX2
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
 #pragma warning(disable : 4752)
-__declspec(naked) uint32_t SumSquareError_AVX2(const uint8_t* src_a,
-                                               const uint8_t* src_b,
-                                               int count) {
+__declspec(naked) uint32_t
+    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
     mov        eax, [esp + 4]  // src_a
     mov        edx, [esp + 8]  // src_b
@@ -148,9 +146,8 @@ uvec32 kHashMul3 = {
     0x00000001,  // 33 ^ 0
 };
 
-__declspec(naked) uint32_t HashDjb2_SSE41(const uint8_t* src,
-                                          int count,
-                                          uint32_t seed) {
+__declspec(naked) uint32_t
+    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
   __asm {
     mov        eax, [esp + 4]  // src
     mov        ecx, [esp + 8]  // count
@@ -200,9 +197,8 @@ __declspec(naked) uint32_t HashDjb2_SSE41(const uint8_t* src,
 
 // Visual C 2012 required for AVX2.
 #ifdef HAS_HASHDJB2_AVX2
-__declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src,
-                                         int count,
-                                         uint32_t seed) {
+__declspec(naked) uint32_t
+    HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
   __asm {
     mov        eax, [esp + 4]  // src
     mov        ecx, [esp + 8]  // count
diff --git a/source/convert.cc b/source/convert.cc
index fbef68f57..0b90ffaaf 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -10,12 +10,10 @@
 
 #include "libyuv/convert.h"
 
-#include <limits.h>
-
 #include "libyuv/basic_types.h"
-#include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
+#include "libyuv/convert_from_argb.h"
 #include "libyuv/rotate.h"
 #include "libyuv/row.h"
 #include "libyuv/scale.h"      // For ScalePlane()
@@ -24,7 +22,6 @@
 
 #ifdef __cplusplus
 namespace libyuv {
-
 extern const struct ArgbConstants kArgbI601Constants;
 extern const struct ArgbConstants kArgbJPEGConstants;
 extern "C" {
@@ -56,16 +53,16 @@ static int I4xxToI420(const uint8_t* src_y,
                       int src_y_height,
                       int src_uv_width,
                       int src_uv_height) {
-  int r;
-  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
-      src_y_width <= 0 || src_y_height == 0 || src_y_height == INT_MIN ||
-      src_uv_width <= 0 || src_uv_height == 0) {
-    return -1;
-  }
   const int dst_y_width = src_y_width;
   const int dst_y_height = Abs(src_y_height);
   const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
   const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+  int r;
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
+      src_y_width <= 0 || src_y_height == 0 || src_uv_width <= 0 ||
+      src_uv_height == 0) {
+    return -1;
+  }
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, src_y_width,
               src_y_height);
@@ -99,16 +96,16 @@ int I420Copy(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -142,16 +139,16 @@ int I010Copy(const uint16_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -183,20 +180,20 @@ static int Planar16bitTo8bit(const uint16_t* src_y,
                              int subsample_x,
                              int subsample_y,
                              int depth) {
-  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
-    return -1;
-  }
   int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
   int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
   int scale = 1 << (24 - depth);
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     uv_height = -uv_height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(uv_height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(uv_height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (uv_height - 1) * src_stride_u;
+    src_v = src_v + (uv_height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -232,15 +229,15 @@ static int I41xToI420(const uint16_t* src_y,
                       int depth) {
   const int scale = 1 << (24 - depth);
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -277,15 +274,15 @@ static int I21xToI420(const uint16_t* src_y,
                       int depth) {
   const int scale = 1 << (24 - depth);
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -532,17 +529,17 @@ static int Ix10ToI010(const uint16_t* src_y,
                       int height,
                       int subsample_x,
                       int subsample_y) {
-  int r;
-  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
-    return -1;
-  }
   const int dst_y_width = width;
   const int dst_y_height = Abs(height);
   const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
   const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
   const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
   const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+  int r;
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
   if (dst_y) {
     CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
@@ -612,11 +609,11 @@ static int IxxxToPxxx(const uint16_t* src_y,
                       int subsample_x,
                       int subsample_y,
                       int depth) {
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return -1;
-  }
   const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
   const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+  if (width <= 0 || height == 0) {
+    return -1;
+  }
 
   ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
                        depth);
@@ -665,16 +662,16 @@ int I010ToNV12(const uint16_t* src_y,
   void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_uv, int width) = MergeUVRow_C;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -888,15 +885,15 @@ int I422ToI210(const uint8_t* src_y,
                int height) {
   int halfwidth = (width + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -927,39 +924,29 @@ int I422ToNV21(const uint8_t* src_y,
                int dst_stride_vu,
                int width,
                int height) {
-  int r;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return -1;
-  }
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
   }
 
   // Allocate u and v buffers
-  const uint64_t plane_size = (uint64_t)halfwidth * halfheight;
-  if (plane_size > SIZE_MAX / 2)
-    return 1;
-  align_buffer_64(plane_u, (size_t)plane_size * 2);
+  align_buffer_64(plane_u, halfwidth * halfheight * 2);
+  uint8_t* plane_v = plane_u + halfwidth * halfheight;
   if (!plane_u)
     return 1;
-  uint8_t* plane_v = plane_u + (size_t)plane_size;
 
-  r = I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-                 dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth,
-                 width, height);
-  if (r != 0) {
-    return r;
-  }
+  I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+             height);
   MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
                halfwidth, halfheight);
   free_aligned_buffer_64(plane_u);
@@ -1053,7 +1040,7 @@ int MT2TToP010(const uint8_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  if (width <= 0 || height == 0 || height == INT_MIN || !src_uv || !dst_uv) {
+  if (width <= 0 || !height || !src_uv || !dst_uv) {
     return -1;
   }
 
@@ -1084,10 +1071,10 @@ int MT2TToP010(const uint8_t* src_y,
       height = -height;
       uv_height = (height + 1) / 2;
       if (dst_y) {
-        dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+        dst_y = dst_y + (height - 1) * dst_stride_y;
         dst_stride_y = -dst_stride_y;
       }
-      dst_uv = dst_uv + (ptrdiff_t)(uv_height - 1) * dst_stride_uv;
+      dst_uv = dst_uv + (uv_height - 1) * dst_stride_uv;
       dst_stride_uv = -dst_stride_uv;
     }
 
@@ -1153,16 +1140,16 @@ int I422ToNV21(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_vu || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -1217,6 +1204,14 @@ int I422ToNV21(const uint8_t* src_y,
     MergeUVRow = MergeUVRow_RVV;
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -1317,15 +1312,15 @@ int I444ToNV12(const uint8_t* src_y,
                int width,
                int height) {
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -1370,15 +1365,14 @@ int I400ToI420(const uint8_t* src_y,
                int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if ((!src_y && dst_y) || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if ((!src_y && dst_y) || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
   if (dst_y) {
@@ -1401,15 +1395,14 @@ int I400ToNV21(const uint8_t* src_y,
                int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if ((!src_y && dst_y) || !dst_vu || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if ((!src_y && dst_y) || !dst_vu || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
   if (dst_y) {
@@ -1437,29 +1430,27 @@ int NV12ToI420(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_uv || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_uv = src_uv + (ptrdiff_t)(halfheight - 1) * src_stride_uv;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
     src_stride_y = -src_stride_y;
     src_stride_uv = -src_stride_uv;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
   // Coalesce rows.
   if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
-      dst_stride_v == halfwidth &&
-      (ptrdiff_t)halfwidth * halfheight <= INT_MAX) {
+      dst_stride_v == halfwidth) {
     halfwidth *= halfheight;
     halfheight = 1;
     src_stride_uv = dst_stride_u = dst_stride_v = 0;
@@ -1507,8 +1498,7 @@ int NV12ToNV24(const uint8_t* src_y,
                int width,
                int height) {
   int r;
-  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -1533,8 +1523,7 @@ int NV16ToNV24(const uint8_t* src_y,
                int width,
                int height) {
   int r;
-  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -1565,7 +1554,7 @@ static int PxxxToIxxx(const uint16_t* src_y,
   const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
   const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
   if (!src_y || !dst_y || !src_uv || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
@@ -1623,8 +1612,7 @@ int P010ToP410(const uint16_t* src_y,
                int width,
                int height) {
   int r;
-  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -1649,8 +1637,7 @@ int P210ToP410(const uint16_t* src_y,
                int width,
                int height) {
   int r;
-  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -1680,13 +1667,10 @@ int YUY2ToI420(const uint8_t* src_yuy2,
       YUY2ToUVRow_C;
   void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
       YUY2ToYRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return -1;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_yuy2 = src_yuy2 + (ptrdiff_t)(height - 1) * src_stride_yuy2;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
 #if defined(HAS_YUY2TOYROW_SSE2)
@@ -1774,13 +1758,10 @@ int UYVYToI420(const uint8_t* src_uyvy,
       UYVYToUVRow_C;
   void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
       UYVYToYRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return -1;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uyvy = src_uyvy + (ptrdiff_t)(height - 1) * src_stride_uyvy;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
 #if defined(HAS_UYVYTOYROW_SSE2)
@@ -1875,13 +1856,10 @@ int AYUVToNV12(const uint8_t* src_ayuv,
                       uint8_t* dst_uv, int width) = AYUVToUVRow_C;
   void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
       AYUVToYRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return -1;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ayuv = src_ayuv + (ptrdiff_t)(height - 1) * src_stride_ayuv;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
     src_stride_ayuv = -src_stride_ayuv;
   }
 // place holders for future intel code
@@ -1955,13 +1933,10 @@ int AYUVToNV21(const uint8_t* src_ayuv,
                       uint8_t* dst_vu, int width) = AYUVToVURow_C;
   void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
       AYUVToYRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return -1;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ayuv = src_ayuv + (ptrdiff_t)(height - 1) * src_stride_ayuv;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
     src_stride_ayuv = -src_stride_ayuv;
   }
 // place holders for future intel code
@@ -2055,7 +2030,7 @@ int ARGBToI420Matrix(const uint8_t* src_argb,
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
                             const struct ArgbConstants* c) =
-      ARGBToUVMatrixRow_C;
+ARGBToUVMatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -2119,40 +2094,10 @@ int ARGBToI420Matrix(const uint8_t* src_argb,
   }
 #endif
 
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-  if (TestCpuFlag(kCpuHasSVE2)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-    }
-  }
-#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
     }
   }
@@ -2172,15 +2117,61 @@ int ARGBToI420Matrix(const uint8_t* src_argb,
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
     }
   }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
 #endif
   if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -2206,7 +2197,7 @@ int ARGBToI420Matrix(const uint8_t* src_argb,
 // Convert ARGB to I420 with Alpha
 // The following version calls ARGBExtractAlpha on the full image.
 LIBYUV_API
-int ARGBToI420Alpha(const uint8_t* src_argb,
+int ARGBToI420AlphaMatrix(const uint8_t* src_argb,
                     int src_stride_argb,
                     uint8_t* dst_y,
                     int dst_stride_y,
@@ -2217,19 +2208,19 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
                     uint8_t* dst_a,
                     int dst_stride_a,
                     int width,
-                    int height) {
-  int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, width, height);
+                    int height,
+                    const struct ArgbConstants* argbconstants) {
+  int r = ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, argbconstants, width, height);
   if (r == 0) {
     r = ARGBExtractAlpha(src_argb, src_stride_argb, dst_a, dst_stride_a, width,
                          height);
   }
   return r;
 }
-#else  // USE_EXTRACTALPHA
-// Convert ARGB to I420 with Alpha
+
 LIBYUV_API
-int ARGBToI420Alpha(const uint8_t* src_argb,
+int ARGBToI420AlphaMatrix(const uint8_t* src_argb,
                     int src_stride_argb,
                     uint8_t* dst_y,
                     int dst_stride_y,
@@ -2240,139 +2231,181 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
                     uint8_t* dst_a,
                     int dst_stride_a,
                     int width,
-                    int height) {
+                    int height,
+                    const struct ArgbConstants* argbconstants) {
+  return ARGBToI420AlphaMatrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                               dst_u, dst_stride_u, dst_v, dst_stride_v,
+                               dst_a, dst_stride_a, width, height,
+                               &kArgbI601Constants);
+}
+#else  // USE_EXTRACTALPHA
+// Convert ARGB to I420 with Alpha
+LIBYUV_API
+int ARGBToI420AlphaMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    uint8_t* dst_a,
+                    int dst_stride_a,
+                    int width,
+                    int height,
+                    const struct ArgbConstants* argbconstants) {
   int y;
-  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
-                      uint8_t* dst_u, uint8_t* dst_v, int width) =
-      ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
-      ARGBToYRow_C;
+  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
+                            uint8_t* dst_u, uint8_t* dst_v, int width,
+                            const struct ArgbConstants* c) =
+      ARGBToUVMatrixRow_C;
+  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
   void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
                               int width) = ARGBExtractAlphaRow_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
+
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+  }
+#endif
+
+#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
+#endif
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
-  if (TestCpuFlag(kCpuHasNeonDotProd)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_NEON_DotProd;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNeonI8MM)) {
-    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_SVE2)
-  if (TestCpuFlag(kCpuHasSVE2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVRow = ARGBToUVRow_SVE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SME;
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVRow = ARGBToUVRow_SME;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToYRow = ARGBToYRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToUVRow = ARGBToUVRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToUVRow = ARGBToUVRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYRow = ARGBToYRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYRow = ARGBToYRow_Any_LASX;
-    ARGBToUVRow = ARGBToUVRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_LASX;
-      ARGBToUVRow = ARGBToUVRow_LASX;
-    }
-  }
-#endif
 #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
@@ -2404,9 +2437,10 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
 #endif
 
   for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    ARGBToUVMatrixRow(src_argb, src_stride_argb, dst_u, dst_v, width, argbconstants);
+    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
+    ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width,
+                       argbconstants);
     ARGBExtractAlphaRow(src_argb, dst_a, width);
     ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a,
                         width);
@@ -2417,12 +2451,31 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
     dst_a += dst_stride_a * 2;
   }
   if (height & 1) {
-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToUVMatrixRow(src_argb, 0, dst_u, dst_v, width, argbconstants);
+    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
     ARGBExtractAlphaRow(src_argb, dst_a, width);
   }
   return 0;
 }
+
+LIBYUV_API
+int ARGBToI420Alpha(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    uint8_t* dst_a,
+                    int dst_stride_a,
+                    int width,
+                    int height) {
+  return ARGBToI420AlphaMatrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                               dst_u, dst_stride_u, dst_v, dst_stride_v,
+                               dst_a, dst_stride_a, width, height,
+                               &kArgbI601Constants);
+}
 #endif  // USE_EXTRACTALPHA
 
 // Convert BGRA to I420.
@@ -2437,60 +2490,147 @@ int BGRAToI420(const uint8_t* src_bgra,
                int dst_stride_v,
                int width,
                int height) {
-  return ARGBToI420Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kBgraI601Constants, width, height);
-}
+  int y;
+  void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
+      BGRAToYRow_C;
+  if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+#if defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToYRow = BGRAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    BGRAToYRow = BGRAToYRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    BGRAToUVRow = BGRAToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      BGRAToUVRow = BGRAToUVRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    BGRAToUVRow = BGRAToUVRow_Any_SME;
+    if (IS_ALIGNED(width, 2)) {
+      BGRAToUVRow = BGRAToUVRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BGRAToYRow = BGRAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BGRAToYRow = BGRAToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      BGRAToYRow = BGRAToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    BGRAToYRow = BGRAToYRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      BGRAToYRow = BGRAToYRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    BGRAToUVRow = BGRAToUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      BGRAToUVRow = BGRAToUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_LSX) && defined(HAS_BGRATOUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    BGRAToYRow = BGRAToYRow_Any_LSX;
+    BGRAToUVRow = BGRAToUVRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_LSX;
+      BGRAToUVRow = BGRAToUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    BGRAToYRow = BGRAToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      BGRAToYRow = BGRAToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    BGRAToYRow = BGRAToYRow_RVV;
+  }
+#endif
 
-// Convert BGRA to I422.
-LIBYUV_API
-int BGRAToI422(const uint8_t* src_bgra,
-               int src_stride_bgra,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height) {
-  return ARGBToI422Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kBgraI601Constants, width, height);
-}
-
-// Convert ABGR to I422.
-LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
-               int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height) {
-  return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kAbgrI601Constants, width, height);
-}
-
-// Convert RGBA to I422.
-LIBYUV_API
-int RGBAToI422(const uint8_t* src_rgba,
-               int src_stride_rgba,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height) {
-  return ARGBToI422Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kRgbaI601Constants, width, height);
+  for (y = 0; y < height - 1; y += 2) {
+    BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+    BGRAToYRow(src_bgra, dst_y, width);
+    BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+    src_bgra += src_stride_bgra * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+    BGRAToYRow(src_bgra, dst_y, width);
+  }
+  return 0;
 }
 
 // Convert ABGR to I420.
@@ -2505,9 +2645,147 @@ int ABGRToI420(const uint8_t* src_abgr,
                int dst_stride_v,
                int width,
                int height) {
-  return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kAbgrI601Constants, width, height);
+  int y;
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ABGRToYRow = ABGRToYRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ABGRToYRow = ABGRToYRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SME;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX) && defined(HAS_ABGRTOUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYRow = ABGRToYRow_Any_LSX;
+    ABGRToUVRow = ABGRToUVRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_LSX;
+      ABGRToUVRow = ABGRToUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYRow = ABGRToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYRow = ABGRToYRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+    ABGRToYRow(src_abgr, dst_y, width);
+    ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+    src_abgr += src_stride_abgr * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+    ABGRToYRow(src_abgr, dst_y, width);
+  }
+  return 0;
 }
 
 // Convert RGBA to I420.
@@ -2522,16 +2800,334 @@ int RGBAToI420(const uint8_t* src_rgba,
                int dst_stride_v,
                int width,
                int height) {
-  return ARGBToI420Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kRgbaI601Constants, width, height);
+  int y;
+  void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
+      RGBAToYRow_C;
+  if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+    src_stride_rgba = -src_stride_rgba;
+  }
+#if defined(HAS_RGBATOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGBAToYRow = RGBAToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGBAToYRow = RGBAToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RGBAToYRow = RGBAToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    RGBAToYRow = RGBAToYRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      RGBAToYRow = RGBAToYRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToYRow = RGBAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    RGBAToYRow = RGBAToYRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RGBAToUVRow = RGBAToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      RGBAToUVRow = RGBAToUVRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    RGBAToUVRow = RGBAToUVRow_Any_SME;
+    if (IS_ALIGNED(width, 2)) {
+      RGBAToUVRow = RGBAToUVRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_LSX) && defined(HAS_RGBATOUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RGBAToYRow = RGBAToYRow_Any_LSX;
+    RGBAToUVRow = RGBAToUVRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_LSX;
+      RGBAToUVRow = RGBAToUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_LASX)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGBAToYRow = RGBAToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RGBAToYRow = RGBAToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RGBAToYRow = RGBAToYRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+    RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+    src_rgba += src_stride_rgba * 2;
+    dst_y += dst_stride_y * 2;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  if (height & 1) {
+    RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+    RGBAToYRow(src_rgba, dst_y, width);
+  }
+  return 0;
 }
 
-// Enabled if 1 pass is available
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_LSX) || \
-     defined(HAS_RGB24TOYROW_RVV))
-#define HAS_RGB24TOYROW
+// Any RGB to I420 with Matrix
+static int RGBToI420Matrix(const uint8_t* src_rgb,
+                           int src_stride_rgb,
+                           uint8_t* dst_y,
+                           int dst_stride_y,
+                           uint8_t* dst_u,
+                           int dst_stride_u,
+                           uint8_t* dst_v,
+                           int dst_stride_v,
+                           int width,
+                           int height,
+                           const struct ArgbConstants* argbconstants,
+                           void (*RGBToARGBRow)(const uint8_t* src_rgb,
+                                                uint8_t* dst_argb,
+                                                int width)) {
+  int y;
+  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb0, int src_stride_argb,
+                            uint8_t* dst_u, uint8_t* dst_v, int width,
+                            const struct ArgbConstants* c) =
+      ARGBToUVMatrixRow_C;
+  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+    }
+  }
 #endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+  }
+#endif
+
+#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
+#endif
+
+  if (!src_rgb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb = src_rgb + (height - 1) * src_stride_rgb;
+    src_stride_rgb = -src_stride_rgb;
+  }
+
+  {
+    // Allocate 2 rows of ARGB.
+    const int row_size = (width * 4 + 31) & ~31;
+    align_buffer_64(row, row_size * 2);
+    if (!row)
+      return 1;
+
+    for (y = 0; y < height - 1; y += 2) {
+      RGBToARGBRow(src_rgb, row, width);
+      RGBToARGBRow(src_rgb + src_stride_rgb, row + row_size, width);
+      ARGBToUVMatrixRow(row, row_size, dst_u, dst_v, width, argbconstants);
+      ARGBToYMatrixRow(row, dst_y, width, argbconstants);
+      ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants);
+      src_rgb += src_stride_rgb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
+    }
+    if (height & 1) {
+      RGBToARGBRow(src_rgb, row, width);
+      ARGBToUVMatrixRow(row, 0, dst_u, dst_v, width, argbconstants);
+      ARGBToYMatrixRow(row, dst_y, width, argbconstants);
+    }
+    free_aligned_buffer_64(row);
+  }
+  return 0;
+}
 
 // Convert RGB24 to I420.
 LIBYUV_API
@@ -2545,171 +3141,8 @@ int RGB24ToI420(const uint8_t* src_rgb24,
                 int dst_stride_v,
                 int width,
                 int height) {
-  int y;
-  void (*RGBToUVMatrixRow)(const uint8_t* src_rgb, int src_stride_rgb,
-                           uint8_t* dst_u, uint8_t* dst_v, int width,
-                           const struct ArgbConstants* c) = RGBToUVMatrixRow_C;
-  void (*RGBToYMatrixRow)(const uint8_t* src_rgb, uint8_t* dst_y, int width,
-                          const struct ArgbConstants* c) = RGBToYMatrixRow_C;
-
-#if defined(HAS_RGBTOYMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGBToYMatrixRow = RGBToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      RGBToYMatrixRow = RGBToYMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_RGBTOUVMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_RGBTOUVMATRIXROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_RGBTOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGBToUVMatrixRow = RGBToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_RGBTOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGBToYMatrixRow = RGBToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGBToYMatrixRow = RGBToYMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_RGBTOYMATRIXROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    RGBToYMatrixRow = RGBToYMatrixRow_LSX;  // This uses the NEON/LSX names
-  }
-#endif
-
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb24 = src_rgb24 + (ptrdiff_t)(height - 1) * src_stride_rgb24;
-    src_stride_rgb24 = -src_stride_rgb24;
-  }
-
-  for (y = 0; y < height - 1; y += 2) {
-    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width,
-                     &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width,
-                    &kArgbI601Constants);
-    src_rgb24 += src_stride_rgb24 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToUVMatrixRow(src_rgb24, 0, dst_u, dst_v, width, &kArgbI601Constants);
-  }
-  return 0;
-}
-#undef HAS_RGB24TOYROW
-
-// Enabled if 1 pass is available
-#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_RVV)
-#define HAS_RGB24TOYJROW
-#endif
-
-// Convert RGB24 to J420.
-LIBYUV_API
-int RGB24ToJ420(const uint8_t* src_rgb24,
-                int src_stride_rgb24,
-                uint8_t* dst_y,
-                int dst_stride_y,
-                uint8_t* dst_u,
-                int dst_stride_u,
-                uint8_t* dst_v,
-                int dst_stride_v,
-                int width,
-                int height) {
-  int y;
-#if defined(HAS_RGB24TOYJROW)
-  void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
-                        uint8_t* dst_u, uint8_t* dst_v, int width) =
-      RGB24ToUVJRow_C;
-  void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
-      RGB24ToYJRow_C;
-#else
   void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
-                       uint8_t* dst_u, uint8_t* dst_v, int width) =
-      ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
-      ARGBToYJRow_C;
-#endif
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb24 = src_rgb24 + (ptrdiff_t)(height - 1) * src_stride_rgb24;
-    src_stride_rgb24 = -src_stride_rgb24;
-  }
-
-#if defined(HAS_RGB24TOYJROW)
-
-// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
-    RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGB24ToYJRow = RGB24ToYJRow_NEON;
-      RGB24ToUVJRow = RGB24ToUVJRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOYJROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    RGB24ToYJRow = RGB24ToYJRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      RGB24ToYJRow = RGB24ToYJRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOYJROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    RGB24ToYJRow = RGB24ToYJRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      RGB24ToYJRow = RGB24ToYJRow_LASX;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOYJROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    RGB24ToYJRow = RGB24ToYJRow_RVV;
-  }
-#endif
-
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else  // HAS_RGB24TOYJROW
-
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -2768,102 +3201,94 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
     RGB24ToARGBRow = RGB24ToARGBRow_RVV;
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToYJRow = ARGBToYJRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVJRow = ARGBToUVJRow_AVX2;
-    }
-  }
-#endif
-#endif  // HAS_RGB24TOYJROW
 
-  {
-#if !defined(HAS_RGB24TOYJROW)
-    // Allocate 2 rows of ARGB.
-    const int row_size = (width * 4 + 31) & ~31;
-    align_buffer_64(row, row_size * 2);
-    if (!row)
-      return 1;
-#endif
-
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYJROW)
-      RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
-      RGB24ToYJRow(src_rgb24, dst_y, width);
-      RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
-      RGB24ToARGBRow(src_rgb24, row, width);
-      RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width);
-      ARGBToUVJRow(row, row_size, dst_u, dst_v, width);
-      ARGBToYJRow(row, dst_y, width);
-      ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
-#endif
-      src_rgb24 += src_stride_rgb24 * 2;
-      dst_y += dst_stride_y * 2;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    if (height & 1) {
-#if defined(HAS_RGB24TOYJROW)
-      RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
-      RGB24ToYJRow(src_rgb24, dst_y, width);
-#else
-      RGB24ToARGBRow(src_rgb24, row, width);
-      ARGBToUVJRow(row, 0, dst_u, dst_v, width);
-      ARGBToYJRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RGB24TOYJROW)
-    free_aligned_buffer_64(row);
-#endif
-  }
-  return 0;
+  return RGBToI420Matrix(src_rgb24, src_stride_rgb24, dst_y, dst_stride_y,
+                         dst_u, dst_stride_u, dst_v, dst_stride_v, width,
+                         height, &kArgbI601Constants, RGB24ToARGBRow);
 }
-#undef HAS_RGB24TOYJROW
 
-// Enabled if 1 pass is available
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_LSX) || \
-     defined(HAS_RAWTOYROW_RVV))
-#define HAS_RAWTOYROW
+// Convert RGB24 to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
 #endif
+#if defined(HAS_RGB24TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_SVE2;
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_RVV;
+  }
+#endif
+
+  return RGBToI420Matrix(src_rgb24, src_stride_rgb24, dst_y, dst_stride_y,
+                         dst_u, dst_stride_u, dst_v, dst_stride_v, width,
+                         height, &kArgbJPEGConstants, RGB24ToARGBRow);
+}
 
 // Convert RAW to I420.
 LIBYUV_API
-int RAWToI420(const uint8_t* src_rgb24,
-              int src_stride_rgb24,
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
@@ -2872,93 +3297,71 @@ int RAWToI420(const uint8_t* src_rgb24,
               int dst_stride_v,
               int width,
               int height) {
-  int y;
-  void (*RGBToUVMatrixRow)(const uint8_t* src_rgb, int src_stride_rgb,
-                           uint8_t* dst_u, uint8_t* dst_v, int width,
-                           const struct ArgbConstants* c) = RGBToUVMatrixRow_C;
-  void (*RGBToYMatrixRow)(const uint8_t* src_rgb, uint8_t* dst_y, int width,
-                          const struct ArgbConstants* c) = RGBToYMatrixRow_C;
-
-#if defined(HAS_RGBTOYMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGBToYMatrixRow = RGBToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      RGBToYMatrixRow = RGBToYMatrixRow_AVX2;
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+      RAWToARGBRow_C;
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_RGBTOUVMATRIXROW_AVX2)
+#if defined(HAS_RAWTOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX2;
+    RAWToARGBRow = RAWToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
-      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX2;
+      RAWToARGBRow = RAWToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_RGBTOUVMATRIXROW_AVX512BW)
+#if defined(HAS_RAWTOARGBROW_AVX512BW)
   if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW;
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
     if (IS_ALIGNED(width, 64)) {
-      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW;
+      RAWToARGBRow = RAWToARGBRow_AVX512BW;
     }
   }
 #endif
-#if defined(HAS_RGBTOUVMATRIXROW_NEON)
+#if defined(HAS_RAWTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGBToUVMatrixRow = RGBToUVMatrixRow_NEON;
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
     }
   }
 #endif
-#if defined(HAS_RGBTOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGBToYMatrixRow = RGBToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGBToYMatrixRow = RGBToYMatrixRow_NEON;
-    }
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
   }
 #endif
-#if defined(HAS_RGBTOYMATRIXROW_LSX)
+#if defined(HAS_RAWTOARGBROW_LSX)
   if (TestCpuFlag(kCpuHasLSX)) {
-    RGBToYMatrixRow = RGBToYMatrixRow_LSX;  // This uses the NEON/LSX names
+    RAWToARGBRow = RAWToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RAWToARGBRow = RAWToARGBRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToARGBRow = RAWToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
 
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb24 = src_rgb24 + (ptrdiff_t)(height - 1) * src_stride_rgb24;
-    src_stride_rgb24 = -src_stride_rgb24;
-  }
-
-  for (y = 0; y < height - 1; y += 2) {
-    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width,
-                     &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width,
-                    &kArgbI601Constants);
-    src_rgb24 += src_stride_rgb24 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToUVMatrixRow(src_rgb24, 0, dst_u, dst_v, width, &kArgbI601Constants);
-  }
-  return 0;
+  return RGBToI420Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, width, height,
+                         &kArgbI601Constants, RAWToARGBRow);
 }
-#undef HAS_RAWTOYROW
-
-// Enabled if 1 pass is available
-#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
-#define HAS_RAWTOYJROW
-#endif
 
 // Convert RAW to J420.
 LIBYUV_API
@@ -2972,75 +3375,8 @@ int RAWToJ420(const uint8_t* src_raw,
               int dst_stride_v,
               int width,
               int height) {
-  int y;
-#if defined(HAS_RAWTOYJROW)
-  void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
-                      uint8_t* dst_u, uint8_t* dst_v, int width) =
-      RAWToUVJRow_C;
-  void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
-      RAWToYJRow_C;
-#else
   void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
-                       uint8_t* dst_u, uint8_t* dst_v, int width) =
-      ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
-      ARGBToYJRow_C;
-#endif
-  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
-    src_stride_raw = -src_stride_raw;
-  }
-
-#if defined(HAS_RAWTOYJROW)
-
-// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RAWToUVJRow = RAWToUVJRow_Any_NEON;
-    RAWToYJRow = RAWToYJRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RAWToYJRow = RAWToYJRow_NEON;
-      RAWToUVJRow = RAWToUVJRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_RAWTOYJROW_LSX) && defined(HAS_RAWTOUVJROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    RAWToUVJRow = RAWToUVJRow_Any_LSX;
-    RAWToYJRow = RAWToYJRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      RAWToYJRow = RAWToYJRow_LSX;
-      RAWToUVJRow = RAWToUVJRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_RAWTOYJROW_LASX) && defined(HAS_RAWTOUVJROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    RAWToUVJRow = RAWToUVJRow_Any_LASX;
-    RAWToYJRow = RAWToYJRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      RAWToYJRow = RAWToYJRow_LASX;
-      RAWToUVJRow = RAWToUVJRow_LASX;
-    }
-  }
-#endif
-#if defined(HAS_RAWTOYJROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    RAWToYJRow = RAWToYJRow_RVV;
-  }
-#endif
-
-// Other platforms do intermediate conversion from RAW to ARGB.
-#else  // HAS_RAWTOYJROW
-
 #if defined(HAS_RAWTOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -3099,85 +3435,160 @@ int RAWToJ420(const uint8_t* src_raw,
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVJRow = ARGBToUVJRow_AVX2;
-    }
-  }
-#endif
-#endif  // HAS_RAWTOYJROW
 
+  return RGBToI420Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, width, height,
+                         &kArgbJPEGConstants, RAWToARGBRow);
+}
+
+
+// RAW big endian (rgb in memory) to I444
+static int RGBToI444Matrix(const uint8_t* src_rgb,
+                           int src_stride_rgb,
+                           uint8_t* dst_y,
+                           int dst_stride_y,
+                           uint8_t* dst_u,
+                           int dst_stride_u,
+                           uint8_t* dst_v,
+                           int dst_stride_v,
+                           int width,
+                           int height,
+                           const struct ArgbConstants* argbconstants,
+                           void (*RGBToARGBRow)(const uint8_t* src_rgb,
+                                                uint8_t* dst_argb,
+                                                int width)) {
+  int y;
+  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+  void (*ARGBToUV444MatrixRow)(const uint8_t* src_argb, uint8_t* dst_u,
+                               uint8_t* dst_v, int width,
+                               const struct ArgbConstants* c) =
+      ARGBToUV444MatrixRow_C;
+
+#if defined(HAS_ARGBTOUV444MATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444MATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444MATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444MATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_NEON;
+    }
+  }
+#endif
+
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+  }
+#endif
+
+  if (!src_rgb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_rgb = src_rgb + (height - 1) * src_stride_rgb;
+    src_stride_rgb = -src_stride_rgb;
+  }
   {
-#if !defined(HAS_RAWTOYJROW)
-    // Allocate 2 rows of ARGB.
-    const int row_size = (width * 4 + 31) & ~31;
-    align_buffer_64(row, row_size * 2);
+    // Allocate a row of ARGB.
+    const int row_size = width * 4;
+    align_buffer_64(row, row_size);
     if (!row)
       return 1;
-#endif
 
-    for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYJROW)
-      RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width);
-      RAWToYJRow(src_raw, dst_y, width);
-      RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
-      RAWToARGBRow(src_raw, row, width);
-      RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
-      ARGBToUVJRow(row, row_size, dst_u, dst_v, width);
-      ARGBToYJRow(row, dst_y, width);
-      ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
-#endif
-      src_raw += src_stride_raw * 2;
-      dst_y += dst_stride_y * 2;
+    for (y = 0; y < height; ++y) {
+      RGBToARGBRow(src_rgb, row, width);
+      ARGBToUV444MatrixRow(row, dst_u, dst_v, width, argbconstants);
+      ARGBToYMatrixRow(row, dst_y, width, argbconstants);
+      src_rgb += src_stride_rgb;
+      dst_y += dst_stride_y;
       dst_u += dst_stride_u;
       dst_v += dst_stride_v;
     }
-    if (height & 1) {
-#if defined(HAS_RAWTOYJROW)
-      RAWToUVJRow(src_raw, 0, dst_u, dst_v, width);
-      RAWToYJRow(src_raw, dst_y, width);
-#else
-      RAWToARGBRow(src_raw, row, width);
-      ARGBToUVJRow(row, 0, dst_u, dst_v, width);
-      ARGBToYJRow(row, dst_y, width);
-#endif
-    }
-#if !defined(HAS_RAWTOYJROW)
     free_aligned_buffer_64(row);
-#endif
   }
   return 0;
 }
-#undef HAS_RAWTOYJROW
 
-// RAW big endian (rgb in memory) to I444
 // 2 step conversion of RAWToARGB then ARGBToY and ARGBToUV444
 LIBYUV_API
 int RAWToI444(const uint8_t* src_raw,
@@ -3190,142 +3601,8 @@ int RAWToI444(const uint8_t* src_raw,
               int dst_stride_v,
               int width,
               int height) {
-  int y;
-  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
-      RAWToARGBRow_C;
-  void (*ARGBToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
-      ARGBToYRow_C;
-  void (*ARGBToUV444Row)(const uint8_t* src_raw, uint8_t* dst_u, uint8_t* dst_v,
-                         int width) = ARGBToUV444Row_C;
-  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
-    src_stride_raw = -src_stride_raw;
-  }
-  // TODO: add row coalesce when main loop handles large width in blocks
-  // TODO: implement UV444 or trim the ifdef below
-#if defined(HAS_ARGBTOUV444ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV444ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUV444Row = ARGBToUV444Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV444ROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToUV444Row = ARGBToUV444Row_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToUV444Row = ARGBToUV444Row_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNeonI8MM)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV444ROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUV444Row = ARGBToUV444Row_LSX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV444ROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToUV444Row = ARGBToUV444Row_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUV444Row = ARGBToUV444Row_LASX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToYRow = ARGBToYRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
-  if (TestCpuFlag(kCpuHasNeonDotProd)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_NEON_DotProd;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYRow = ARGBToYRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYRow = ARGBToYRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_LASX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    ARGBToYRow = ARGBToYRow_RVV;
-  }
-#endif
-
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RAWToARGBRow_C;
 #if defined(HAS_RAWTOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -3384,26 +3661,10 @@ int RAWToI444(const uint8_t* src_raw,
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
-
-  {
-    // Allocate a row of ARGB.
-    const int row_size = width * 4;
-    align_buffer_64(row, row_size);
-    if (!row)
-      return 1;
-
-    for (y = 0; y < height; ++y) {
-      RAWToARGBRow(src_raw, row, width);
-      ARGBToUV444Row(row, dst_u, dst_v, width);
-      ARGBToYRow(row, dst_y, width);
-      src_raw += src_stride_raw;
-      dst_y += dst_stride_y;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    free_aligned_buffer_64(row);
-  }
-  return 0;
+  return RGBToI444Matrix(src_raw, src_stride_raw,
+                         dst_y, dst_stride_y, dst_u, dst_stride_u,
+                         dst_v, dst_stride_v, width, height,
+                         &kArgbI601Constants, RAWToARGBRow);
 }
 
 // RAW big endian (rgb in memory) to J444
@@ -3419,133 +3680,8 @@ int RAWToJ444(const uint8_t* src_raw,
               int dst_stride_v,
               int width,
               int height) {
-  int y;
-  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
-      RAWToARGBRow_C;
-  void (*ARGBToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
-      ARGBToYJRow_C;
-  void (*ARGBToUVJ444Row)(const uint8_t* src_raw, uint8_t* dst_u,
-                          uint8_t* dst_v, int width) = ARGBToUVJ444Row_C;
-  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
-    src_stride_raw = -src_stride_raw;
-  }
-  // TODO: add row coalesce when main loop handles large width in blocks
-#if defined(HAS_ARGBTOUVJ444ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJ444Row = ARGBToUVJ444Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJ444ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVJ444Row = ARGBToUVJ444Row_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJ444ROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToUVJ444Row = ARGBToUVJ444Row_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJ444ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToUVJ444Row = ARGBToUVJ444Row_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJ444ROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNeonI8MM)) {
-    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON_I8MM;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToUVJ444Row = ARGBToUVJ444Row_NEON_I8MM;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJ444ROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVJ444Row = ARGBToUVJ444Row_LSX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVJ444ROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVJ444Row = ARGBToUVJ444Row_LASX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_NEON_DOTPROD)
-  if (TestCpuFlag(kCpuHasNeonDotProd)) {
-    ARGBToYJRow = ARGBToYJRow_Any_NEON_DotProd;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_NEON_DotProd;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYJRow = ARGBToYJRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYJRow = ARGBToYJRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_LASX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYJROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    ARGBToYJRow = ARGBToYJRow_RVV;
-  }
-#endif
-
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RAWToARGBRow_C;
 #if defined(HAS_RAWTOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -3604,26 +3740,10 @@ int RAWToJ444(const uint8_t* src_raw,
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
-
-  {
-    // Allocate a row of ARGB.
-    const int row_size = width * 4;
-    align_buffer_64(row, row_size);
-    if (!row)
-      return 1;
-
-    for (y = 0; y < height; ++y) {
-      RAWToARGBRow(src_raw, row, width);
-      ARGBToUVJ444Row(row, dst_u, dst_v, width);
-      ARGBToYJRow(row, dst_y, width);
-      src_raw += src_stride_raw;
-      dst_y += dst_stride_y;
-      dst_u += dst_stride_u;
-      dst_v += dst_stride_v;
-    }
-    free_aligned_buffer_64(row);
-  }
-  return 0;
+  return RGBToI444Matrix(src_raw, src_stride_raw,
+                         dst_y, dst_stride_y, dst_u, dst_stride_u,
+                         dst_v, dst_stride_v, width, height,
+                         &kArgbJPEGConstants, RAWToARGBRow);
 }
 
 // Convert RGB565 to I420.
@@ -3638,77 +3758,30 @@ int RGB565ToI420(const uint8_t* src_rgb565,
                  int dst_stride_v,
                  int width,
                  int height) {
-  int y;
-  void (*RGB565ToUVMatrixRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
-                              uint8_t* dst_u, uint8_t* dst_v, int width,
-                              const struct ArgbConstants* c) =
-      RGB565ToUVMatrixRow_C;
-  void (*RGB565ToYMatrixRow)(const uint8_t* src_rgb565, uint8_t* dst_y,
-                             int width, const struct ArgbConstants* c) =
-      RGB565ToYMatrixRow_C;
-
-#if defined(HAS_RGB565TOYMATRIXROW_AVX2)
+  void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RGB565ToARGBRow_C;
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    RGB565ToYMatrixRow = RGB565ToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      RGB565ToYMatrixRow = RGB565ToYMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_RGB565TOUVMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_Any_AVX2;
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_AVX2;
+      RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_RGB565TOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_RGB565TOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB565ToYMatrixRow = RGB565ToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      RGB565ToYMatrixRow = RGB565ToYMatrixRow_NEON;
-    }
-  }
-#endif
-
-  if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_rgb565 = src_rgb565 + (ptrdiff_t)(height - 1) * src_stride_rgb565;
-    src_stride_rgb565 = -src_stride_rgb565;
-  }
-
-  for (y = 0; y < height - 1; y += 2) {
-    RGB565ToUVMatrixRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width,
-                        &kArgbI601Constants);
-    RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants);
-    RGB565ToYMatrixRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y,
-                       width, &kArgbI601Constants);
-    src_rgb565 += src_stride_rgb565 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants);
-    RGB565ToUVMatrixRow(src_rgb565, 0, dst_u, dst_v, width,
-                        &kArgbI601Constants);
-  }
-  return 0;
+  return RGBToI420Matrix(src_rgb565, src_stride_rgb565,
+                         dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                         dst_stride_v, width, height,
+                         &kArgbI601Constants, RGB565ToARGBRow);
 }
+
 // Convert ARGB1555 to I420.
 LIBYUV_API
 int ARGB1555ToI420(const uint8_t* src_argb1555,
@@ -3721,77 +3794,30 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
                    int dst_stride_v,
                    int width,
                    int height) {
-  int y;
-  void (*ARGB1555ToUVMatrixRow)(
-      const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u,
-      uint8_t* dst_v, int width, const struct ArgbConstants* c) =
-      ARGB1555ToUVMatrixRow_C;
-  void (*ARGB1555ToYMatrixRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
-                               int width, const struct ArgbConstants* c) =
-      ARGB1555ToYMatrixRow_C;
-
-#if defined(HAS_ARGB1555TOYMATRIXROW_AVX2)
+  void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = ARGB1555ToARGBRow_C;
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB1555TOUVMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_Any_AVX2;
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_AVX2;
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGB1555TOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGB1555TOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_NEON;
-    }
-  }
-#endif
-
-  if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb1555 = src_argb1555 + (ptrdiff_t)(height - 1) * src_stride_argb1555;
-    src_stride_argb1555 = -src_stride_argb1555;
-  }
-
-  for (y = 0; y < height - 1; y += 2) {
-    ARGB1555ToUVMatrixRow(src_argb1555, src_stride_argb1555, dst_u, dst_v,
-                          width, &kArgbI601Constants);
-    ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants);
-    ARGB1555ToYMatrixRow(src_argb1555 + src_stride_argb1555,
-                         dst_y + dst_stride_y, width, &kArgbI601Constants);
-    src_argb1555 += src_stride_argb1555 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants);
-    ARGB1555ToUVMatrixRow(src_argb1555, 0, dst_u, dst_v, width,
-                          &kArgbI601Constants);
-  }
-  return 0;
+  return RGBToI420Matrix(src_argb1555, src_stride_argb1555,
+                         dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                         dst_stride_v, width, height,
+                         &kArgbI601Constants, ARGB1555ToARGBRow);
 }
+
 // Convert ARGB4444 to I420.
 LIBYUV_API
 int ARGB4444ToI420(const uint8_t* src_argb4444,
@@ -3804,90 +3830,62 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
                    int dst_stride_v,
                    int width,
                    int height) {
-  int y;
-  void (*ARGB4444ToUVMatrixRow)(
-      const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u,
-      uint8_t* dst_v, int width, const struct ArgbConstants* c) =
-      ARGB4444ToUVMatrixRow_C;
-  void (*ARGB4444ToYMatrixRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
-                               int width, const struct ArgbConstants* c) =
-      ARGB4444ToYMatrixRow_C;
-
-#if defined(HAS_ARGB4444TOYMATRIXROW_AVX2)
+  void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = ARGB4444ToARGBRow_C;
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_Any_AVX2;
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX;
     if (IS_ALIGNED(width, 32)) {
-      ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_AVX2;
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX;
     }
   }
 #endif
-#if defined(HAS_ARGB4444TOUVMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGB4444TOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGB4444TOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_NEON;
-    }
-  }
-#endif
-
-  if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb4444 = src_argb4444 + (ptrdiff_t)(height - 1) * src_stride_argb4444;
-    src_stride_argb4444 = -src_stride_argb4444;
-  }
-
-  for (y = 0; y < height - 1; y += 2) {
-    ARGB4444ToUVMatrixRow(src_argb4444, src_stride_argb4444, dst_u, dst_v,
-                          width, &kArgbI601Constants);
-    ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants);
-    ARGB4444ToYMatrixRow(src_argb4444 + src_stride_argb4444,
-                         dst_y + dst_stride_y, width, &kArgbI601Constants);
-    src_argb4444 += src_stride_argb4444 * 2;
-    dst_y += dst_stride_y * 2;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  if (height & 1) {
-    ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants);
-    ARGB4444ToUVMatrixRow(src_argb4444, 0, dst_u, dst_v, width,
-                          &kArgbI601Constants);
-  }
-  return 0;
+  return RGBToI420Matrix(src_argb4444, src_stride_argb4444,
+                         dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                         dst_stride_v, width, height,
+                         &kArgbI601Constants, ARGB4444ToARGBRow);
 }
-// Convert RGB24 to J400.
-LIBYUV_API
-int RGB24ToJ400(const uint8_t* src_rgb24,
-                int src_stride_rgb24,
-                uint8_t* dst_yj,
-                int dst_stride_yj,
-                int width,
-                int height) {
+
+
+
+static int RGBToI400Matrix(const uint8_t* src_rgb,
+                           int src_stride_rgb,
+                           uint8_t* dst_y,
+                           int dst_stride_y,
+                           int width,
+                           int height,
+                           const struct ArgbConstants* argbconstants,
+                           void (*RGBToARGBRow)(const uint8_t* src_rgb,
+                                                uint8_t* dst_argb,
+                                                int width)) {
   int y;
-  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
-      RGB24ToARGBRow_C;
   void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                            const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
@@ -3950,79 +3948,16 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
   }
 #endif
 
-  if (!src_rgb24 || !dst_yj || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_rgb || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_rgb24 = src_rgb24 + (ptrdiff_t)(height - 1) * src_stride_rgb24;
-    src_stride_rgb24 = -src_stride_rgb24;
+    src_rgb = src_rgb + (height - 1) * src_stride_rgb;
+    src_stride_rgb = -src_stride_rgb;
   }
-  // Coalesce rows.
-  if (src_stride_rgb24 == width * 3 && dst_stride_yj == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
-    width *= height;
-    height = 1;
-    src_stride_rgb24 = dst_stride_yj = 0;
-  }
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_SVE2)
-  if (TestCpuFlag(kCpuHasSVE2)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_SVE2;
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_LASX;
-    }
-  }
-#endif
-#if defined(HAS_RGB24TOARGBROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_RVV;
-  }
-#endif
+
   {
     // Allocate 1 row of ARGB.
     const int row_size = (width * 4 + 31) & ~31;
@@ -4031,10 +3966,10 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
       return 1;
 
     for (y = 0; y < height; ++y) {
-      RGB24ToARGBRow(src_rgb24, row, width);
-      ARGBToYMatrixRow(row, dst_yj, width, &kArgbJPEGConstants);
-      src_rgb24 += src_stride_rgb24;
-      dst_yj += dst_stride_yj;
+      RGBToARGBRow(src_rgb, row, width);
+      ARGBToYMatrixRow(row, dst_y, width, argbconstants);
+      src_rgb += src_stride_rgb;
+      dst_y += dst_stride_y;
     }
     free_aligned_buffer_64(row);
   }
@@ -4049,89 +3984,8 @@ int RAWToJ400(const uint8_t* src_raw,
               int dst_stride_yj,
               int width,
               int height) {
-  int y;
-  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
-      RAWToARGBRow_C;
-  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
-                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
-#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
-  if (TestCpuFlag(kCpuHasNeonDotProd)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
-  }
-#endif
-
-  if (!src_raw || !dst_yj || width <= 0 || height == 0 || height == INT_MIN) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
-    src_stride_raw = -src_stride_raw;
-  }
-  // Coalesce rows.
-  if (src_stride_raw == width * 3 && dst_stride_yj == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
-    width *= height;
-    height = 1;
-    src_stride_raw = dst_stride_yj = 0;
-  }
-
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RAWToARGBRow_C;
 #if defined(HAS_RAWTOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -4190,23 +4044,9 @@ int RAWToJ400(const uint8_t* src_raw,
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
-
-  {
-    // Allocate 1 row of ARGB.
-    const int row_size = (width * 4 + 31) & ~31;
-    align_buffer_64(row, row_size);
-    if (!row)
-      return 1;
-
-    for (y = 0; y < height; ++y) {
-      RAWToARGBRow(src_raw, row, width);
-      ARGBToYMatrixRow(row, dst_yj, width, &kArgbJPEGConstants);
-      src_raw += src_stride_raw;
-      dst_yj += dst_stride_yj;
-    }
-    free_aligned_buffer_64(row);
-  }
-  return 0;
+  return RGBToI400Matrix(src_raw, src_stride_raw,
+                         dst_yj, dst_stride_yj, width, height,
+                         &kArgbJPEGConstants, RAWToARGBRow);
 }
 
 // Convert Android420 to I420.
@@ -4246,19 +4086,18 @@ static int Biplanar16bitTo8bit(const uint16_t* src_y,
                                int subsample_x,
                                int subsample_y,
                                int depth) {
-  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
   int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
   int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
   int scale = 1 << (24 - depth);
+  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     uv_height = -uv_height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_uv = src_uv + (ptrdiff_t)(uv_height - 1) * src_stride_uv;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (uv_height - 1) * src_stride_uv;
     src_stride_y = -src_stride_y;
     src_stride_uv = -src_stride_uv;
   }
@@ -4313,19 +4152,19 @@ static int Planar8bitTo8bit(const uint8_t* src_y,
                             int bias_y,
                             int scale_uv,
                             int bias_uv) {
-  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
-    return -1;
-  }
   int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
   int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     uv_height = -uv_height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(uv_height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(uv_height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (uv_height - 1) * src_stride_u;
+    src_v = src_v + (uv_height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -4365,7 +4204,82 @@ int J420ToI420(const uint8_t* src_y,
                           1, 220, 16, 225, 16);
 }
 
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_yj,
+                int dst_stride_yj,
+                int width,
+                int height) {
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_SVE2;
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RGB24TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_RVV;
+  }
+#endif
+  return RGBToI400Matrix(src_rgb24, src_stride_rgb24,
+                         dst_yj, dst_stride_yj, width, height,
+                         &kArgbJPEGConstants, RGB24ToARGBRow);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
+
+
+
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 3844e9691..7672a6692 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -11,7 +11,6 @@
 #include "libyuv/convert_argb.h"
 
 #include <assert.h>
-#include <limits.h>
 
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
@@ -34,14 +33,13 @@ int ARGBCopy(const uint8_t* src_argb,
              int dst_stride_argb,
              int width,
              int height) {
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -69,14 +67,13 @@ int I420ToARGBMatrix(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I422ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I422TOARGBROW_SSSE3)
@@ -319,20 +316,18 @@ int I422ToARGBMatrix(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I422ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
   if (src_stride_y == width && src_stride_u * 2 == width &&
-      src_stride_v * 2 == width && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_v * 2 == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
@@ -575,19 +570,18 @@ int I444ToARGBMatrix(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I444ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
   if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
-      dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
@@ -813,19 +807,18 @@ int I444ToRGB24Matrix(const uint8_t* src_y,
                          const struct YuvConstants* yuvconstants, int width) =
       I444ToRGB24Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (ptrdiff_t)(height - 1) * dst_stride_rgb24;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
   // Coalesce rows.
   if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
-      dst_stride_rgb24 == width * 3 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0;
@@ -945,14 +938,13 @@ int I010ToAR30Matrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I210ToAR30Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_I210TOAR30ROW_NEON)
@@ -1124,14 +1116,13 @@ int I012ToAR30Matrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I212ToAR30Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_I212TOAR30ROW_SSSE3)
@@ -1201,14 +1192,13 @@ int I210ToAR30Matrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I210ToAR30Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_I210TOAR30ROW_NEON)
@@ -1375,14 +1365,13 @@ int I410ToAR30Matrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I410ToAR30Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_I410TOAR30ROW_NEON)
@@ -1448,14 +1437,13 @@ int I010ToARGBMatrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I210ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I210TOARGBROW_SSSE3)
@@ -1631,14 +1619,13 @@ int I012ToARGBMatrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I212ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I212TOARGBROW_SSSE3)
@@ -1706,14 +1693,13 @@ int I210ToARGBMatrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I210ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I210TOARGBROW_SSSE3)
@@ -1886,14 +1872,13 @@ int I410ToARGBMatrix(const uint16_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I410ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I410TOARGBROW_SSSE3)
@@ -1955,14 +1940,13 @@ int P010ToARGBMatrix(const uint16_t* src_y,
       const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_P210TOARGBROW_SSSE3)
@@ -2025,14 +2009,13 @@ int P210ToARGBMatrix(const uint16_t* src_y,
       const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_P210TOARGBROW_SSSE3)
@@ -2093,14 +2076,13 @@ int P010ToAR30Matrix(const uint16_t* src_y,
       const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_P210TOAR30ROW_SSSE3)
@@ -2163,14 +2145,13 @@ int P210ToAR30Matrix(const uint16_t* src_y,
       const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_P210TOAR30ROW_SSSE3)
@@ -2242,13 +2223,13 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y,
                            int width) = ARGBAttenuateRow_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
@@ -2395,13 +2376,13 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y,
                            int width) = ARGBAttenuateRow_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
@@ -2546,13 +2527,13 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y,
                            int width) = ARGBAttenuateRow_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I444ALPHATOARGBROW_SSSE3)
@@ -2810,13 +2791,13 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y,
                            int width) = ARGBAttenuateRow_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I210ALPHATOARGBROW_NEON)
@@ -2942,13 +2923,13 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y,
                            int width) = ARGBAttenuateRow_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I210ALPHATOARGBROW_NEON)
@@ -3072,13 +3053,13 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y,
                            int width) = ARGBAttenuateRow_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I410ALPHATOARGBROW_NEON)
@@ -3190,18 +3171,17 @@ int I400ToARGBMatrix(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I400ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !dst_argb || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -3285,23 +3265,29 @@ int J400ToARGB(const uint8_t* src_y,
   int y;
   void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
       J400ToARGBRow_C;
-  if (!src_y || !dst_argb || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
   }
-
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_J400TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     J400ToARGBRow = J400ToARGBRow_Any_AVX2;
@@ -3310,14 +3296,6 @@ int J400ToARGB(const uint8_t* src_y,
     }
   }
 #endif
-#if defined(HAS_J400TOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    J400ToARGBRow = J400ToARGBRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 32)) {
-      J400ToARGBRow = J400ToARGBRow_AVX512BW;
-    }
-  }
-#endif
 #if defined(HAS_J400TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     J400ToARGBRow = J400ToARGBRow_Any_NEON;
@@ -3460,19 +3438,17 @@ int ARGBToBGRA(const uint8_t* src_argb,
   int y;
   void (*ARGBToBGRARow)(const uint8_t* src_argb, uint8_t* dst_bgra, int width) =
       ARGBToBGRARow_C;
-  if (!src_argb || !dst_bgra || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_bgra || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_bgra == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_bgra == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_bgra = 0;
@@ -3503,19 +3479,17 @@ int ARGBToABGR(const uint8_t* src_argb,
   int y;
   void (*ARGBToABGRRow)(const uint8_t* src_argb, uint8_t* dst_abgr, int width) =
       ARGBToABGRRow_C;
-  if (!src_argb || !dst_abgr || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_abgr || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_abgr == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_abgr == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_abgr = 0;
@@ -3558,19 +3532,17 @@ int RGBAToARGB(const uint8_t* src_rgba,
   int y;
   void (*RGBAToARGBRow)(const uint8_t* src_rgba, uint8_t* dst_argb, int width) =
       RGBAToARGBRow_C;
-  if (!src_rgba || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_rgba || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_rgba = src_rgba + (ptrdiff_t)(height - 1) * src_stride_rgba;
+    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
     src_stride_rgba = -src_stride_rgba;
   }
   // Coalesce rows.
-  if (src_stride_rgba == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_rgba == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgba = dst_stride_argb = 0;
@@ -3601,19 +3573,17 @@ int AR64ToAB64(const uint16_t* src_ar64,
   int y;
   void (*AR64ToAB64Row)(const uint16_t* src_ar64, uint16_t* dst_ab64,
                         int width) = AR64ToAB64Row_C;
-  if (!src_ar64 || !dst_ab64 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_ar64 || !dst_ab64 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ar64 = src_ar64 + (ptrdiff_t)(height - 1) * src_stride_ar64;
+    src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
     src_stride_ar64 = -src_stride_ar64;
   }
   // Coalesce rows.
-  if (src_stride_ar64 == width * 4 && dst_stride_ab64 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_ar64 == width * 4 && dst_stride_ab64 == width * 4) {
     width *= height;
     height = 1;
     src_stride_ar64 = dst_stride_ab64 = 0;
@@ -3645,19 +3615,17 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
   int y;
   void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  if (!src_rgb24 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_rgb24 = src_rgb24 + (ptrdiff_t)(height - 1) * src_stride_rgb24;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
     src_stride_rgb24 = -src_stride_rgb24;
   }
   // Coalesce rows.
-  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb24 = dst_stride_argb = 0;
@@ -3720,7 +3688,7 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
     RGB24ToARGBRow = RGB24ToARGBRow_RVV;
   }
 #endif
-  for (y = 0; y < height; ++y) {
+for (y = 0; y < height; ++y) {
     RGB24ToARGBRow(src_rgb24, dst_argb, width);
     src_rgb24 += src_stride_rgb24;
     dst_argb += dst_stride_argb;
@@ -3739,18 +3707,17 @@ int RAWToARGB(const uint8_t* src_raw,
   int y;
   void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  if (!src_raw || !dst_argb || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_raw || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_argb = 0;
@@ -3833,18 +3800,17 @@ int RAWToRGBA(const uint8_t* src_raw,
   int y;
   void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
       RAWToRGBARow_C;
-  if (!src_raw || !dst_rgba || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_rgba = 0;
@@ -3895,24 +3861,29 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
   int y;
   void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
                           int width) = RGB565ToARGBRow_C;
-  if (!src_rgb565 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_rgb565 = src_rgb565 + (ptrdiff_t)(height - 1) * src_stride_rgb565;
+    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
     src_stride_rgb565 = -src_stride_rgb565;
   }
   // Coalesce rows.
-  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb565 = dst_stride_argb = 0;
   }
-
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_RGB565TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
@@ -3965,24 +3936,29 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
   int y;
   void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
                             int width) = ARGB1555ToARGBRow_C;
-  if (!src_argb1555 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb1555 = src_argb1555 + (ptrdiff_t)(height - 1) * src_stride_argb1555;
+    src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
     src_stride_argb1555 = -src_stride_argb1555;
   }
   // Coalesce rows.
-  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb1555 = dst_stride_argb = 0;
   }
-
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGB1555TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
@@ -4040,24 +4016,29 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444,
   int y;
   void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
                             int width) = ARGB4444ToARGBRow_C;
-  if (!src_argb4444 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb4444 = src_argb4444 + (ptrdiff_t)(height - 1) * src_stride_argb4444;
+    src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
     src_stride_argb4444 = -src_stride_argb4444;
   }
   // Coalesce rows.
-  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb4444 = dst_stride_argb = 0;
   }
-
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGB4444TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
@@ -4108,19 +4089,17 @@ int AR30ToARGB(const uint8_t* src_ar30,
                int width,
                int height) {
   int y;
-  if (!src_ar30 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ar30 = src_ar30 + (ptrdiff_t)(height - 1) * src_stride_ar30;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
     src_stride_ar30 = -src_stride_ar30;
   }
   // Coalesce rows.
-  if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_ar30 = dst_stride_argb = 0;
@@ -4142,19 +4121,17 @@ int AR30ToABGR(const uint8_t* src_ar30,
                int width,
                int height) {
   int y;
-  if (!src_ar30 || !dst_abgr || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ar30 = src_ar30 + (ptrdiff_t)(height - 1) * src_stride_ar30;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
     src_stride_ar30 = -src_stride_ar30;
   }
   // Coalesce rows.
-  if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
     width *= height;
     height = 1;
     src_stride_ar30 = dst_stride_abgr = 0;
@@ -4176,19 +4153,17 @@ int AR30ToAB30(const uint8_t* src_ar30,
                int width,
                int height) {
   int y;
-  if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ar30 = src_ar30 + (ptrdiff_t)(height - 1) * src_stride_ar30;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
     src_stride_ar30 = -src_stride_ar30;
   }
   // Coalesce rows.
-  if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
     width *= height;
     height = 1;
     src_stride_ar30 = dst_stride_ab30 = 0;
@@ -4212,19 +4187,17 @@ int AR64ToARGB(const uint16_t* src_ar64,
   int y;
   void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
                         int width) = AR64ToARGBRow_C;
-  if (!src_ar64 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_ar64 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ar64 = src_ar64 + (ptrdiff_t)(height - 1) * src_stride_ar64;
+    src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
     src_stride_ar64 = -src_stride_ar64;
   }
   // Coalesce rows.
-  if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_ar64 = dst_stride_argb = 0;
@@ -4278,19 +4251,17 @@ int AB64ToARGB(const uint16_t* src_ab64,
   int y;
   void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
                         int width) = AB64ToARGBRow_C;
-  if (!src_ab64 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_ab64 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ab64 = src_ab64 + (ptrdiff_t)(height - 1) * src_stride_ab64;
+    src_ab64 = src_ab64 + (height - 1) * src_stride_ab64;
     src_stride_ab64 = -src_stride_ab64;
   }
   // Coalesce rows.
-  if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_ab64 = dst_stride_argb = 0;
@@ -4349,14 +4320,13 @@ int NV12ToARGBMatrix(const uint8_t* src_y,
       const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_NV12TOARGBROW_SSSE3)
@@ -4442,14 +4412,13 @@ int NV21ToARGBMatrix(const uint8_t* src_y,
       const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
   assert(yuvconstants);
-  if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_NV21TOARGBROW_SSSE3)
@@ -4594,14 +4563,13 @@ int NV12ToRGB24Matrix(const uint8_t* src_y,
       const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (ptrdiff_t)(height - 1) * dst_stride_rgb24;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
 #if defined(HAS_NV12TORGB24ROW_NEON)
@@ -4671,14 +4639,13 @@ int NV21ToRGB24Matrix(const uint8_t* src_y,
       const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (ptrdiff_t)(height - 1) * dst_stride_rgb24;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
 #if defined(HAS_NV21TORGB24ROW_NEON)
@@ -4802,14 +4769,13 @@ int NV21ToYUV24(const uint8_t* src_y,
   int y;
   void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
                          uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
-  if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_yuv24 = dst_yuv24 + (ptrdiff_t)(height - 1) * dst_stride_yuv24;
+    dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
     dst_stride_yuv24 = -dst_stride_yuv24;
   }
 #if defined(HAS_NV21TOYUV24ROW_NEON)
@@ -4860,19 +4826,17 @@ int YUY2ToARGBMatrix(const uint8_t* src_yuy2,
   void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants, int width) =
       YUY2ToARGBRow_C;
-  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_yuy2 = src_yuy2 + (ptrdiff_t)(height - 1) * src_stride_yuy2;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_argb = 0;
@@ -4952,19 +4916,17 @@ int UYVYToARGBMatrix(const uint8_t* src_uyvy,
   void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants, int width) =
       UYVYToARGBRow_C;
-  if (!src_uyvy || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uyvy = src_uyvy + (ptrdiff_t)(height - 1) * src_stride_uyvy;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_argb = 0;
@@ -5066,15 +5028,14 @@ int Android420ToARGBMatrix(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 
@@ -5097,10 +5058,7 @@ int Android420ToARGBMatrix(const uint8_t* src_y,
   }
 
   // General case fallback creates NV12
-  const uint64_t uv_size = (uint64_t)halfwidth * 2 * halfheight;
-  if (uv_size > SIZE_MAX)
-    return 1;
-  align_buffer_64(plane_uv, (size_t)uv_size);
+  align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
   if (!plane_uv)
     return 1;
   dst_uv = plane_uv;
@@ -5173,14 +5131,13 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I422ToRGBARow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgba = dst_rgba + (ptrdiff_t)(height - 1) * dst_stride_rgba;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
     dst_stride_rgba = -dst_stride_rgba;
   }
 #if defined(HAS_I422TORGBAROW_SSSE3)
@@ -5301,14 +5258,13 @@ int NV12ToRGB565Matrix(const uint8_t* src_y,
       const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
       const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb565 = dst_rgb565 + (ptrdiff_t)(height - 1) * dst_stride_rgb565;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
 #if defined(HAS_NV12TORGB565ROW_SSSE3)
@@ -5397,14 +5353,13 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants, int width) =
       I422ToRGBARow_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgba = dst_rgba + (ptrdiff_t)(height - 1) * dst_stride_rgba;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
     dst_stride_rgba = -dst_stride_rgba;
   }
 #if defined(HAS_I422TORGBAROW_SSSE3)
@@ -5530,14 +5485,13 @@ int I420ToRGB24Matrix(const uint8_t* src_y,
                          const struct YuvConstants* yuvconstants, int width) =
       I422ToRGB24Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (ptrdiff_t)(height - 1) * dst_stride_rgb24;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
 #if defined(HAS_I422TORGB24ROW_SSSE3)
@@ -5556,22 +5510,6 @@ int I420ToRGB24Matrix(const uint8_t* src_y,
     }
   }
 #endif
-#if defined(HAS_I422TORGB24ROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_AVX512BW;
-    if (IS_ALIGNED(width, 32)) {
-      I422ToRGB24Row = I422ToRGB24Row_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX512VBMI)
-  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_AVX512VBMI;
-    if (IS_ALIGNED(width, 32)) {
-      I422ToRGB24Row = I422ToRGB24Row_AVX512VBMI;
-    }
-  }
-#endif
 #if defined(HAS_I422TORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
@@ -5751,14 +5689,13 @@ int I422ToRGB24Matrix(const uint8_t* src_y,
                          const struct YuvConstants* yuvconstants, int width) =
       I422ToRGB24Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (ptrdiff_t)(height - 1) * dst_stride_rgb24;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
 #if defined(HAS_I422TORGB24ROW_SSSE3)
@@ -5777,22 +5714,6 @@ int I422ToRGB24Matrix(const uint8_t* src_y,
     }
   }
 #endif
-#if defined(HAS_I422TORGB24ROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_AVX512BW;
-    if (IS_ALIGNED(width, 32)) {
-      I422ToRGB24Row = I422ToRGB24Row_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX512VBMI)
-  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
-    I422ToRGB24Row = I422ToRGB24Row_Any_AVX512VBMI;
-    if (IS_ALIGNED(width, 32)) {
-      I422ToRGB24Row = I422ToRGB24Row_AVX512VBMI;
-    }
-  }
-#endif
 #if defined(HAS_I422TORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
@@ -5897,13 +5818,13 @@ int I420ToARGB1555(const uint8_t* src_y,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB1555Row_C;
   if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb1555 = dst_argb1555 + (ptrdiff_t)(height - 1) * dst_stride_argb1555;
+    dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
     dst_stride_argb1555 = -dst_stride_argb1555;
   }
 #if defined(HAS_I422TOARGB1555ROW_SSSE3)
@@ -5988,13 +5909,13 @@ int I420ToARGB4444(const uint8_t* src_y,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB4444Row_C;
   if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb4444 = dst_argb4444 + (ptrdiff_t)(height - 1) * dst_stride_argb4444;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
     dst_stride_argb4444 = -dst_stride_argb4444;
   }
 #if defined(HAS_I422TOARGB4444ROW_SSSE3)
@@ -6080,14 +6001,13 @@ int I420ToRGB565Matrix(const uint8_t* src_y,
                           const struct YuvConstants* yuvconstants, int width) =
       I422ToRGB565Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb565 = dst_rgb565 + (ptrdiff_t)(height - 1) * dst_stride_rgb565;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
 #if defined(HAS_I422TORGB565ROW_SSSE3)
@@ -6223,14 +6143,13 @@ int I422ToRGB565Matrix(const uint8_t* src_y,
                           const struct YuvConstants* yuvconstants, int width) =
       I422ToRGB565Row_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb565 = dst_rgb565 + (ptrdiff_t)(height - 1) * dst_stride_rgb565;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
 #if defined(HAS_I422TORGB565ROW_SSSE3)
@@ -6337,14 +6256,13 @@ int I420ToRGB565Dither(const uint8_t* src_y,
   void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
                                 uint32_t dither4, int width) =
       ARGBToRGB565DitherRow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb565 = dst_rgb565 + (ptrdiff_t)(height - 1) * dst_stride_rgb565;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
     dst_stride_rgb565 = -dst_stride_rgb565;
   }
   if (!dither4x4) {
@@ -6501,14 +6419,13 @@ int I420ToAR30Matrix(const uint8_t* src_y,
       I422ToAR30Row_C;
 
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 
@@ -6649,14 +6566,13 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y,
   void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
                              int dst_width) = ScaleRowUp2_Linear_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I444TOARGBROW_SSSE3)
@@ -6799,14 +6715,13 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y,
   void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
                              int dst_width) = ScaleRowUp2_Linear_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I444TOARGBROW_SSSE3)
@@ -6926,14 +6841,13 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
   void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
                              int dst_width) = ScaleRowUp2_Linear_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (ptrdiff_t)(height - 1) * dst_stride_rgb24;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
 #if defined(HAS_I444TORGB24ROW_SSSE3)
@@ -7079,14 +6993,13 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y,
   void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                                 int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_I410TOAR30ROW_NEON)
@@ -7204,14 +7117,13 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y,
   void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                                 int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_I410TOAR30ROW_NEON)
@@ -7308,14 +7220,13 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y,
   void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                                 int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I410TOARGBROW_SSSE3)
@@ -7432,14 +7343,13 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y,
   void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                                 int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I410TOARGBROW_SSSE3)
@@ -7545,13 +7455,13 @@ static int I420AlphaToARGBMatrixBilinear(
                              int dst_width) = ScaleRowUp2_Linear_Any_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I444ALPHATOARGBROW_SSSE3)
@@ -7765,13 +7675,13 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
                              int dst_width) = ScaleRowUp2_Linear_Any_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I444ALPHATOARGBROW_SSSE3)
@@ -7950,13 +7860,13 @@ static int I010AlphaToARGBMatrixBilinear(
                                 int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I410ALPHATOARGBROW_NEON)
@@ -8144,13 +8054,13 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
                              int dst_width) = ScaleRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
   if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_I410ALPHATOARGBROW_NEON)
@@ -8292,14 +8202,13 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y,
       const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
       ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_P410TOARGBROW_SSSE3)
@@ -8404,14 +8313,13 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y,
   void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv,
                              int dst_width) = ScaleUVRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 #if defined(HAS_P410TOARGBROW_SSSE3)
@@ -8502,14 +8410,13 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y,
       const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr,
       ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_P410TOAR30ROW_SSSE3)
@@ -8614,14 +8521,13 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y,
   void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv,
                              int dst_width) = ScaleUVRowUp2_Linear_16_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
 #if defined(HAS_P410TOAR30ROW_SSSE3)
@@ -8714,14 +8620,13 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
   void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr,
                              int dst_width) = ScaleRowUp2_Linear_Any_C;
   assert(yuvconstants);
-  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb24 = dst_rgb24 + (ptrdiff_t)(height - 1) * dst_stride_rgb24;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
     dst_stride_rgb24 = -dst_stride_rgb24;
   }
 #if defined(HAS_I444TORGB24ROW_SSSE3)
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 40ca02190..5cf88fa2d 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -10,8 +10,6 @@
 
 #include "libyuv/convert_from.h"
 
-#include <limits.h>
-
 #include "libyuv/basic_types.h"
 #include "libyuv/convert.h"  // For I420Copy
 #include "libyuv/cpu_id.h"
@@ -89,16 +87,16 @@ int I420ToI010(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -134,16 +132,16 @@ int I420ToI012(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -228,7 +226,7 @@ int I010ToI410(const uint16_t* src_y,
                int height) {
   int r;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
 
@@ -265,7 +263,7 @@ int I210ToI410(const uint16_t* src_y,
                int height) {
   int r;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
 
@@ -301,7 +299,7 @@ int I422ToI444(const uint8_t* src_y,
                int height) {
   int r;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
 
@@ -326,7 +324,7 @@ int I400Copy(const uint8_t* src_y,
              int dst_stride_y,
              int width,
              int height) {
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
@@ -348,20 +346,18 @@ int I422ToYUY2(const uint8_t* src_y,
   void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
                         const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_yuy2 = dst_yuy2 + (ptrdiff_t)(height - 1) * dst_stride_yuy2;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
   if (src_stride_y == width && src_stride_u * 2 == width &&
-      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@@ -416,14 +412,13 @@ int I420ToYUY2(const uint8_t* src_y,
   void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
                         const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_yuy2 = dst_yuy2 + (ptrdiff_t)(height - 1) * dst_stride_yuy2;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
 #if defined(HAS_I422TOYUY2ROW_SSE2)
@@ -497,20 +492,18 @@ int I422ToUYVY(const uint8_t* src_y,
   void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
                         const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_uyvy = dst_uyvy + (ptrdiff_t)(height - 1) * dst_stride_uyvy;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
   if (src_stride_y == width && src_stride_u * 2 == width &&
-      src_stride_v * 2 == width && dst_stride_uyvy == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@@ -581,14 +574,13 @@ int I420ToUYVY(const uint8_t* src_y,
   void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
                         const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_uyvy = dst_uyvy + (ptrdiff_t)(height - 1) * dst_stride_uyvy;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
     dst_stride_uyvy = -dst_stride_uyvy;
   }
 #if defined(HAS_I422TOUYVYROW_SSE2)
@@ -663,16 +655,16 @@ int I420ToNV12(const uint8_t* src_y,
   int halfwidth = (width + 1) / 2;
   int halfheight = (height + 1) / 2;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -718,8 +710,7 @@ int ConvertFromI420(const uint8_t* y,
                     uint32_t fourcc) {
   uint32_t format = CanonicalFourCC(fourcc);
   int r = 0;
-  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
     return -1;
   }
   switch (format) {
@@ -791,7 +782,7 @@ int ConvertFromI420(const uint8_t* y,
       break;
     case FOURCC_NV12: {
       int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
-      uint8_t* dst_uv = dst_sample + (ptrdiff_t)dst_y_stride * height;
+      uint8_t* dst_uv = dst_sample + dst_y_stride * height;
       r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                      dst_sample_stride ? dst_sample_stride : width, dst_uv,
                      dst_sample_stride ? dst_sample_stride : width, width,
@@ -800,7 +791,7 @@ int ConvertFromI420(const uint8_t* y,
     }
     case FOURCC_NV21: {
       int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
-      uint8_t* dst_vu = dst_sample + (ptrdiff_t)dst_y_stride * height;
+      uint8_t* dst_vu = dst_sample + dst_y_stride * height;
       r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                      dst_sample_stride ? dst_sample_stride : width, dst_vu,
                      dst_sample_stride ? dst_sample_stride : width, width,
@@ -816,11 +807,11 @@ int ConvertFromI420(const uint8_t* y,
       uint8_t* dst_u;
       uint8_t* dst_v;
       if (format == FOURCC_YV12) {
-        dst_v = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_u = dst_v + (ptrdiff_t)halfstride * halfheight;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * halfheight;
       } else {
-        dst_u = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_v = dst_u + (ptrdiff_t)halfstride * halfheight;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * halfheight;
       }
       r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                    dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
@@ -834,11 +825,11 @@ int ConvertFromI420(const uint8_t* y,
       uint8_t* dst_u;
       uint8_t* dst_v;
       if (format == FOURCC_YV16) {
-        dst_v = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_u = dst_v + (ptrdiff_t)halfstride * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * height;
       } else {
-        dst_u = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_v = dst_u + (ptrdiff_t)halfstride * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * height;
       }
       r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                      dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
@@ -851,11 +842,11 @@ int ConvertFromI420(const uint8_t* y,
       uint8_t* dst_u;
       uint8_t* dst_v;
       if (format == FOURCC_YV24) {
-        dst_v = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_u = dst_v + (ptrdiff_t)dst_sample_stride * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + dst_sample_stride * height;
       } else {
-        dst_u = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_v = dst_u + (ptrdiff_t)dst_sample_stride * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + dst_sample_stride * height;
       }
       r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                      dst_sample_stride, dst_u, dst_sample_stride, dst_v,
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 77b3851d4..a139c1d20 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -10,8 +10,6 @@
 
 #include "libyuv/convert_from_argb.h"
 
-#include <limits.h>
-
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
@@ -54,9 +52,10 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
   int y;
   void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                            const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
-  void (*ARGBToUV444MatrixRow)(
-      const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width,
-      const struct ArgbConstants* c) = ARGBToUV444MatrixRow_C;
+  void (*ARGBToUV444MatrixRow)(const uint8_t* src_argb, uint8_t* dst_u,
+                               uint8_t* dst_v, int width,
+                               const struct ArgbConstants* c) =
+ARGBToUV444MatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -153,13 +152,13 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
   }
 #endif
   if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -209,7 +208,7 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
                             const struct ArgbConstants* c) =
-      ARGBToUVMatrixRow_C;
+ARGBToUVMatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -273,40 +272,10 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
   }
 #endif
 
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-  if (TestCpuFlag(kCpuHasSVE2)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-    }
-  }
-#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
     }
   }
@@ -326,15 +295,61 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
     }
   }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
 #endif
   if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -358,9 +373,8 @@ int ARGBToNV12(const uint8_t* src_argb,
                int dst_stride_uv,
                int width,
                int height) {
-  return ARGBToNV12Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                          dst_uv, dst_stride_uv, &kArgbI601Constants, width,
-                          height);
+  return ARGBToNV12Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_uv,
+                          dst_stride_uv, &kArgbI601Constants, width, height);
 }
 
 LIBYUV_API
@@ -380,7 +394,7 @@ int ARGBToNV12Matrix(const uint8_t* src_argb,
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
                             const struct ArgbConstants* c) =
-      ARGBToUVMatrixRow_C;
+ARGBToUVMatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -444,467 +458,14 @@ int ARGBToNV12Matrix(const uint8_t* src_argb,
   }
 #endif
 
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-  if (TestCpuFlag(kCpuHasSVE2)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-    }
-  }
-#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
-    }
-  }
-#endif
-  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
-                     uint8_t* dst_uv, int width) = MergeUVRow_C;
-  if (!src_argb || !dst_y || !dst_uv || !argbconstants || width <= 0 ||
-      height == 0 || height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MergeUVRow = MergeUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow = MergeUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow = MergeUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    MergeUVRow = MergeUVRow_Any_AVX512BW;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow = MergeUVRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow = MergeUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow = MergeUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    MergeUVRow = MergeUVRow_SME;
-  }
-#endif
-#if defined(HAS_MERGEUVROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    MergeUVRow = MergeUVRow_Any_LSX;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow = MergeUVRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    MergeUVRow = MergeUVRow_RVV;
-  }
-#endif
-
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-  uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
-  if (!row_u)
-    return 1;
-
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVMatrixRow(src_argb, src_stride_argb, row_u, row_v, width,
-                      argbconstants);
-    MergeUVRow(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
-    ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width,
-                     argbconstants);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_uv += dst_stride_uv;
-  }
-  if (height & 1) {
-    ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, argbconstants);
-    MergeUVRow(row_u, row_v, dst_uv, halfwidth);
-    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
-  }
-  free_aligned_buffer_64(row_u);
-  return 0;
-}
-
-int ARGBToNV21Matrix(const uint8_t* src_argb,
-                     int src_stride_argb,
-                     uint8_t* dst_y,
-                     int dst_stride_y,
-                     uint8_t* dst_vu,
-                     int dst_stride_uv,
-                     const struct ArgbConstants* argbconstants,
-                     int width,
-                     int height) {
-  int y;
-  int halfwidth = (width + 1) >> 1;
-  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
-                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
-  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
-                            uint8_t* dst_u, uint8_t* dst_v, int width,
-                            const struct ArgbConstants* c) =
-      ARGBToUVMatrixRow_C;
-
-#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
-  if (TestCpuFlag(kCpuHasNeonDotProd)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_LASX)
-  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
-  }
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-  if (TestCpuFlag(kCpuHasSVE2)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    if (IS_ALIGNED(width, 2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
-    }
-  }
-#endif
-  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
-                     uint8_t* dst_vu, int width) = MergeUVRow_C;
-  if (!src_argb || !dst_y || !dst_vu || !argbconstants || width <= 0 ||
-      height == 0 || height == INT_MIN) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_MERGEUVROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    MergeUVRow = MergeUVRow_Any_SSE2;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow = MergeUVRow_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow = MergeUVRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    MergeUVRow = MergeUVRow_Any_AVX512BW;
-    if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow = MergeUVRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow = MergeUVRow_Any_NEON;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow = MergeUVRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    MergeUVRow = MergeUVRow_SME;
-  }
-#endif
-#if defined(HAS_MERGEUVROW_LSX)
-  if (TestCpuFlag(kCpuHasLSX)) {
-    MergeUVRow = MergeUVRow_Any_LSX;
-    if (IS_ALIGNED(halfwidth, 16)) {
-      MergeUVRow = MergeUVRow_LSX;
-    }
-  }
-#endif
-#if defined(HAS_MERGEUVROW_RVV)
-  if (TestCpuFlag(kCpuHasRVV)) {
-    MergeUVRow = MergeUVRow_RVV;
-  }
-#endif
-
-  // Allocate a rows of uv.
-  align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-  uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
-  if (!row_u)
-    return 1;
-
-  for (y = 0; y < height - 1; y += 2) {
-    ARGBToUVMatrixRow(src_argb, src_stride_argb, row_u, row_v, width,
-                      argbconstants);
-    MergeUVRow(row_u, row_v, dst_vu, halfwidth);
-    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
-    ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width,
-                     argbconstants);
-    src_argb += src_stride_argb * 2;
-    dst_y += dst_stride_y * 2;
-    dst_vu += dst_stride_uv;
-  }
-  if (height & 1) {
-    ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, argbconstants);
-    MergeUVRow(row_u, row_v, dst_vu, halfwidth);
-    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
-  }
-  free_aligned_buffer_64(row_u);
-  return 0;
-}
-LIBYUV_API
-int ARGBToI400Matrix(const uint8_t* src_argb,
-                     int src_stride_argb,
-                     uint8_t* dst_y,
-                     int dst_stride_y,
-                     const struct ArgbConstants* constants,
-                     int width,
-                     int height) {
-  int y;
-  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
-                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
-  if (!src_argb || !dst_y || !constants || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
-  if (TestCpuFlag(kCpuHasNeonDotProd)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
-    }
-  }
-#endif
-
-  for (y = 0; y < height; ++y) {
-    ARGBToYMatrixRow(src_argb, dst_y, width, constants);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-  }
-  return 0;
-}
-LIBYUV_API
-int ARGBToYUY2Matrix(const uint8_t* src_argb,
-                     int src_stride_argb,
-                     uint8_t* dst_yuy2,
-                     int dst_stride_yuy2,
-                     const struct ArgbConstants* constants,
-                     int width,
-                     int height) {
-  int y;
-  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
-                            uint8_t* dst_u, uint8_t* dst_v, int width,
-                            const struct ArgbConstants* c) =
-      ARGBToUVMatrixRow_C;
-  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
-                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
-  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
-                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
-      I422ToYUY2Row_C;
-
-  if (!src_argb || !dst_yuy2 || !constants || width <= 0 || height == 0 ||
-      height == INT_MIN) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    dst_yuy2 = dst_yuy2 + (ptrdiff_t)(height - 1) * dst_stride_yuy2;
-    dst_stride_yuy2 = -dst_stride_yuy2;
-  }
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
-    }
-  }
-#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
@@ -929,6 +490,975 @@ int ARGBToYUY2Matrix(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
+#endif
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                     uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_uv || !argbconstants || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    MergeUVRow = MergeUVRow_SME;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    MergeUVRow = MergeUVRow_Any_LSX;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
+
+  // Allocate a rows of uv.
+  align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+  uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+  if (!row_u)
+    return 1;
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVMatrixRow(src_argb, src_stride_argb, row_u, row_v, width,
+                      argbconstants);
+    MergeUVRow(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
+    ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width,
+                      argbconstants);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, argbconstants);
+    MergeUVRow(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
+  }
+  free_aligned_buffer_64(row_u);
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
+                            uint8_t* dst_u, uint8_t* dst_v, int width,
+                            const struct ArgbConstants* c) =
+      ARGBToUVMatrixRow_C;
+  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    MergeUVRow = MergeUVRow_SME;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    MergeUVRow = MergeUVRow_Any_LSX;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+    if (!row_u)
+      return 1;
+
+    for (y = 0; y < height - 1; y += 2) {
+      ARGBToUVMatrixRow(src_argb, src_stride_argb, row_u, row_v, width, &kArgbI601Constants);
+      MergeUVRow(row_v, row_u, dst_vu, halfwidth);
+      ARGBToYMatrixRow(src_argb, dst_y, width, &kArgbI601Constants);
+      ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width, &kArgbI601Constants);
+      src_argb += src_stride_argb * 2;
+      dst_y += dst_stride_y * 2;
+      dst_vu += dst_stride_vu;
+    }
+    if (height & 1) {
+      ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, &kArgbI601Constants);
+      MergeUVRow(row_v, row_u, dst_vu, halfwidth);
+      ARGBToYMatrixRow(src_argb, dst_y, width, &kArgbI601Constants);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ABGRToYRow = ABGRToYRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ABGRToYRow = ABGRToYRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ABGRToUVRow = ABGRToUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SME;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYRow = ABGRToYRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYRow = ABGRToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYRow = ABGRToYRow_RVV;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    MergeUVRow = MergeUVRow_SME;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    MergeUVRow = MergeUVRow_Any_LSX;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+    if (!row_u)
+      return 1;
+
+    for (y = 0; y < height - 1; y += 2) {
+      ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+      MergeUVRow(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+      ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+      src_abgr += src_stride_abgr * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+      MergeUVRow(row_u, row_v, dst_uv, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+      ABGRToYRow_C;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYRow = ABGRToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVRow = ABGRToUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ABGRToUVRow = ABGRToUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ABGRToUVRow = ABGRToUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ABGRToYRow = ABGRToYRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SME;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYRow = ABGRToYRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYRow = ABGRToYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYRow = ABGRToYRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYRow = ABGRToYRow_RVV;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    MergeUVRow = MergeUVRow_SME;
+  }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    MergeUVRow = MergeUVRow_Any_LSX;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow = MergeUVRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
+  {
+    // Allocate a rows of uv.
+    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+    if (!row_u)
+      return 1;
+
+    for (y = 0; y < height - 1; y += 2) {
+      ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+      MergeUVRow(row_v, row_u, dst_vu, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+      ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+      src_abgr += src_stride_abgr * 2;
+      dst_y += dst_stride_y * 2;
+      dst_vu += dst_stride_vu;
+    }
+    if (height & 1) {
+      ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+      MergeUVRow(row_v, row_u, dst_vu, halfwidth);
+      ABGRToYRow(src_abgr, dst_y, width);
+    }
+    free_aligned_buffer_64(row_u);
+  }
+  return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
+                            uint8_t* dst_u, uint8_t* dst_v, int width,
+                            const struct ArgbConstants* c) =
+      ARGBToUVMatrixRow_C;
+  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+
+  if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
@@ -945,8 +1475,25 @@ int ARGBToYUY2Matrix(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_LSX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOYUY2ROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_LASX;
+    }
+  }
+#endif
 
   {
+    // Allocate a rows of yuv.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
     uint8_t* row_u = row_y + ((width + 63) & ~63);
     uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
@@ -954,8 +1501,8 @@ int ARGBToYUY2Matrix(const uint8_t* src_argb,
       return 1;
 
     for (y = 0; y < height; ++y) {
-      ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, constants);
-      ARGBToYMatrixRow(src_argb, row_y, width, constants);
+      ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, &kArgbI601Constants);
+      ARGBToYMatrixRow(src_argb, row_y, width, &kArgbI601Constants);
       I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
       src_argb += src_stride_argb;
       dst_yuy2 += dst_stride_yuy2;
@@ -966,14 +1513,14 @@ int ARGBToYUY2Matrix(const uint8_t* src_argb,
   return 0;
 }
 
+// Convert ARGB to UYVY.
 LIBYUV_API
-int ARGBToUYVYMatrix(const uint8_t* src_argb,
-                     int src_stride_argb,
-                     uint8_t* dst_uyvy,
-                     int dst_stride_uyvy,
-                     const struct ArgbConstants* constants,
-                     int width,
-                     int height) {
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
@@ -985,15 +1532,29 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
                         const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
 
-  if (!src_argb || !dst_uyvy || !constants || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
+  // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_uyvy = dst_uyvy + (ptrdiff_t)(height - 1) * dst_stride_uyvy;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
     dst_stride_uyvy = -dst_stride_uyvy;
   }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYMATRIXROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
@@ -1018,6 +1579,43 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
@@ -1042,6 +1640,52 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
@@ -1058,8 +1702,25 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_LASX;
+    }
+  }
+#endif
 
   {
+    // Allocate a rows of yuv.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
     uint8_t* row_u = row_y + ((width + 63) & ~63);
     uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
@@ -1067,8 +1728,8 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
       return 1;
 
     for (y = 0; y < height; ++y) {
-      ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, constants);
-      ARGBToYMatrixRow(src_argb, row_y, width, constants);
+      ARGBToUVMatrixRow(src_argb, 0, row_u, row_v, width, &kArgbI601Constants);
+      ARGBToYMatrixRow(src_argb, row_y, width, &kArgbI601Constants);
       I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
       src_argb += src_stride_argb;
       dst_uyvy += dst_stride_uyvy;
@@ -1079,75 +1740,16 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
   return 0;
 }
 
-// Same as NV12 but U and V swapped.
-LIBYUV_API
-int ARGBToNV21(const uint8_t* src_argb,
-               int src_stride_argb,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_vu,
-               int dst_stride_vu,
-               int width,
-               int height) {
-  return ARGBToNV21Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                          dst_vu, dst_stride_vu, &kArgbI601Constants, width,
-                          height);
-}
-
-LIBYUV_API
-int ABGRToNV12(const uint8_t* src_abgr,
-               int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_uv,
-               int dst_stride_uv,
-               int width,
-               int height) {
-  return ARGBToNV12Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y,
-                          dst_uv, dst_stride_uv, &kAbgrI601Constants, width,
-                          height);
-}
-
-// Same as NV12 but U and V swapped.
-LIBYUV_API
-int ABGRToNV21(const uint8_t* src_abgr,
-               int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_vu,
-               int dst_stride_vu,
-               int width,
-               int height) {
-  return ARGBToNV21Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y,
-                          dst_vu, dst_stride_vu, &kAbgrI601Constants, width,
-                          height);
-}
-
-// Convert ARGB to YUY2.
-LIBYUV_API
-int ARGBToYUY2(const uint8_t* src_argb,
-               int src_stride_argb,
-               uint8_t* dst_yuy2,
-               int dst_stride_yuy2,
-               int width,
-               int height) {
-  return ARGBToYUY2Matrix(src_argb, src_stride_argb, dst_yuy2, dst_stride_yuy2,
-                          &kArgbI601Constants, width, height);
-}
-
-// Convert ARGB to UYVY.
-LIBYUV_API
-int ARGBToUYVY(const uint8_t* src_argb,
-               int src_stride_argb,
-               uint8_t* dst_uyvy,
-               int dst_stride_uyvy,
-               int width,
-               int height) {
-  return ARGBToUYVYMatrix(src_argb, src_stride_argb, dst_uyvy, dst_stride_uyvy,
-                          &kArgbI601Constants, width, height);
-}
-
 // Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400Matrix(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     const struct ArgbConstants* argbconstants,
+                     int width,
+                     int height);
+
 LIBYUV_API
 int ARGBToI400(const uint8_t* src_argb,
                int src_stride_argb,
@@ -1158,6 +1760,100 @@ int ARGBToI400(const uint8_t* src_argb,
   return ARGBToI400Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
                           &kArgbI601Constants, width, height);
 }
+LIBYUV_API
+int ARGBToI400Matrix(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     const struct ArgbConstants* argbconstants,
+                     int width,
+                     int height) {
+  int y;
+  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = 0;
+  }
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
 
 #ifndef __riscv
 // Shuffle table for converting ARGB to RGBA.
@@ -1187,18 +1883,16 @@ int ARGBToRGBA(const uint8_t* src_argb,
   int y;
   void (*ARGBToRGBARow)(const uint8_t* src_argb, uint8_t* dst_rgba, int width) =
       ARGBToRGBARow_C;
-  if (!src_argb || !dst_rgba || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_rgba == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_rgba == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgba = 0;
@@ -1230,18 +1924,16 @@ int ARGBToRGB24(const uint8_t* src_argb,
   int y;
   void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRGB24Row_C;
-  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb24 = 0;
@@ -1324,17 +2016,16 @@ int ARGBToRAW(const uint8_t* src_argb,
   int y;
   void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRAWRow_C;
-  if (!src_argb || !dst_raw || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_raw = 0;
@@ -1416,19 +2107,25 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
   void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
                                 uint32_t dither4, int width) =
       ARGBToRGB565DitherRow_C;
-  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   if (!dither4x4) {
     dither4x4 = kDither565_4x4;
   }
-
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
@@ -1489,23 +2186,28 @@ int ARGBToRGB565(const uint8_t* src_argb,
   int y;
   void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
                           int width) = ARGBToRGB565Row_C;
-  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb565 = 0;
   }
-
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB565ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
@@ -1564,23 +2266,28 @@ int ARGBToARGB1555(const uint8_t* src_argb,
   int y;
   void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
                             int width) = ARGBToARGB1555Row_C;
-  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb1555 = 0;
   }
-
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOARGB1555ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
@@ -1633,23 +2340,28 @@ int ARGBToARGB4444(const uint8_t* src_argb,
   int y;
   void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
                             int width) = ARGBToARGB4444Row_C;
-  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb4444 = 0;
   }
-
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOARGB4444ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
@@ -1702,18 +2414,16 @@ int ABGRToAR30(const uint8_t* src_abgr,
   int y;
   void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
       ABGRToAR30Row_C;
-  if (!src_abgr || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_abgr = src_abgr + (ptrdiff_t)(height - 1) * src_stride_abgr;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
     src_stride_abgr = -src_stride_abgr;
   }
   // Coalesce rows.
-  if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
     width *= height;
     height = 1;
     src_stride_abgr = dst_stride_ar30 = 0;
@@ -1761,18 +2471,16 @@ int ARGBToAR30(const uint8_t* src_argb,
   int y;
   void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToAR30Row_C;
-  if (!src_argb || !dst_ar30 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_ar30 = 0;
@@ -1809,68 +2517,10 @@ int ARGBToAR30(const uint8_t* src_argb,
   return 0;
 }
 
-// ARGB little endian (bgra in memory) to J444
-LIBYUV_API
-int ARGBToJ444(const uint8_t* src_argb,
-               int src_stride_argb,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height) {
-  return ARGBToI444Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kArgbJPEGConstants, width, height);
-}
 
-// Convert ARGB to J420. (JPeg full range I420).
-LIBYUV_API
-int ARGBToJ420(const uint8_t* src_argb,
-               int src_stride_argb,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height) {
-  return ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kArgbJPEGConstants, width, height);
-}
 
 // Convert ARGB to J422. (JPeg full range I422).
-LIBYUV_API
-int ARGBToJ422(const uint8_t* src_argb,
-               int src_stride_argb,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height) {
-  return ARGBToI422Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kArgbJPEGConstants, width, height);
-}
 
-// Convert ARGB to J400.
-LIBYUV_API
-int ARGBToJ400(const uint8_t* src_argb,
-               int src_stride_argb,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               int width,
-               int height) {
-  return ARGBToI400Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                          &kArgbJPEGConstants, width, height);
-}
 
 // Convert RGBA to J400.
 LIBYUV_API
@@ -1883,17 +2533,16 @@ int RGBAToJ400(const uint8_t* src_rgba,
   int y;
   void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
       RGBAToYJRow_C;
-  if (!src_rgba || !dst_yj || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_rgba = src_rgba + (ptrdiff_t)(height - 1) * src_stride_rgba;
+    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
     src_stride_rgba = -src_stride_rgba;
   }
   // Coalesce rows.
-  if (src_stride_rgba == width * 4 && dst_stride_yj == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
     width *= height;
     height = 1;
     src_stride_rgba = dst_stride_yj = 0;
@@ -1972,34 +2621,316 @@ int RGBAToJ400(const uint8_t* src_rgba,
 LIBYUV_API
 int ABGRToJ420(const uint8_t* src_abgr,
                int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
                int width,
                int height) {
-  return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kAbgrJPEGConstants, width, height);
+  int y;
+  void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                       uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+      ABGRToUVJRow_C;
+  void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+      ABGRToYJRow_C;
+  if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYJRow = ABGRToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ABGRToYJRow = ABGRToYJRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ABGRToYJRow = ABGRToYJRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVJRow = ABGRToUVJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ABGRToUVJRow = ABGRToUVJRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYJRow = ABGRToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SME;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYJRow = ABGRToYJRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYJRow = ABGRToYJRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYJRow = ABGRToYJRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYJRow = ABGRToYJRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width);
+    ABGRToYJRow(src_abgr, dst_yj, width);
+    ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width);
+    src_abgr += src_stride_abgr * 2;
+    dst_yj += dst_stride_yj * 2;
+    dst_uj += dst_stride_uj;
+    dst_vj += dst_stride_vj;
+  }
+  if (height & 1) {
+    ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width);
+    ABGRToYJRow(src_abgr, dst_yj, width);
+  }
+  return 0;
 }
 
 // Convert ABGR to J422. (JPeg full range I422).
 LIBYUV_API
 int ABGRToJ422(const uint8_t* src_abgr,
                int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
                int width,
                int height) {
-  return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v,
-                          &kAbgrJPEGConstants, width, height);
+  int y;
+  void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                       uint8_t* dst_uj, uint8_t* dst_vj, int width) =
+      ABGRToUVJRow_C;
+  void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+      ABGRToYJRow_C;
+  if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+  // Coalesce rows.
+  if (src_stride_abgr == width * 4 && dst_stride_yj == width &&
+      dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0;
+  }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYJRow = ABGRToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToUVJRow = ABGRToUVJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ABGRToUVJRow = ABGRToUVJRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYJRow = ABGRToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SVE2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVJROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SME;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SME;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYJRow = ABGRToYJRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYJRow = ABGRToYJRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYJRow = ABGRToYJRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYJRow = ABGRToYJRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width);
+    ABGRToYJRow(src_abgr, dst_yj, width);
+    src_abgr += src_stride_abgr;
+    dst_yj += dst_stride_yj;
+    dst_uj += dst_stride_uj;
+    dst_vj += dst_stride_vj;
+  }
+  return 0;
 }
 
 // Convert ABGR to J400.
@@ -2010,8 +2941,83 @@ int ABGRToJ400(const uint8_t* src_abgr,
                int dst_stride_yj,
                int width,
                int height) {
-  return ARGBToI400Matrix(src_abgr, src_stride_abgr, dst_yj, dst_stride_yj,
-                          &kAbgrJPEGConstants, width, height);
+  int y;
+  void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) =
+      ABGRToYJRow_C;
+  if (!src_abgr || !dst_yj || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+  // Coalesce rows.
+  if (src_stride_abgr == width * 4 && dst_stride_yj == width) {
+    width *= height;
+    height = 1;
+    src_stride_abgr = dst_stride_yj = 0;
+  }
+#if defined(HAS_ABGRTOYJROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToYJRow = ABGRToYJRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToYJRow = ABGRToYJRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYJRow = ABGRToYJRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToYJRow = ABGRToYJRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ABGRToYJRow = ABGRToYJRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYJRow = ABGRToYJRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ABGRToYJRow = ABGRToYJRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      ABGRToYJRow = ABGRToYJRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOYJROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    ABGRToYJRow = ABGRToYJRow_RVV;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ABGRToYJRow(src_abgr, dst_yj, width);
+    src_abgr += src_stride_abgr;
+    dst_yj += dst_stride_yj;
+  }
+  return 0;
 }
 
 // Convert ARGB to AR64.
@@ -2025,19 +3031,17 @@ int ARGBToAR64(const uint8_t* src_argb,
   int y;
   void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
                         int width) = ARGBToAR64Row_C;
-  if (!src_argb || !dst_ar64 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_ar64 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_ar64 = 0;
@@ -2091,19 +3095,17 @@ int ARGBToAB64(const uint8_t* src_argb,
   int y;
   void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
                         int width) = ARGBToAB64Row_C;
-  if (!src_argb || !dst_ab64 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_ab64 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_ab64 = 0;
@@ -2168,7 +3170,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
   void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                            const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
   void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj,
-                     uint8_t* dst_vu, int width) = MergeUVRow_C;
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
@@ -2231,14 +3233,14 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
   }
 #endif
 
-  if (!src_raw || !dst_y || !dst_vu || !argbconstants || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+
+  if (!src_raw || !dst_y || !dst_vu || !argbconstants || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
     src_stride_raw = -src_stride_raw;
   }
 
@@ -2300,6 +3302,30 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
@@ -2309,7 +3335,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
   }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
@@ -2330,27 +3356,19 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
+#if defined(HAS_ARGBTOUVMATRIXROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LSX;
     }
   }
 #endif
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
+#if defined(HAS_ARGBTOUVMATRIXROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_LASX;
     if (IS_ALIGNED(width, 32)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_LASX;
     }
   }
 #endif
@@ -2427,8 +3445,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
       ARGBToUVMatrixRow(row, row_size, row_u, row_v, width, argbconstants);
       MergeUVRow(row_v, row_u, dst_vu, halfwidth);
       ARGBToYMatrixRow(row, dst_y, width, argbconstants);
-      ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width,
-                       argbconstants);
+      ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants);
       src_raw += src_stride_raw * 2;
       dst_y += dst_stride_y * 2;
       dst_vu += dst_stride_vu;
@@ -2486,7 +3503,74 @@ int RGB24ToNV12(const uint8_t* src_rgb24,
                          height);
 }
 
+
+
+// Convert ARGB to J444.
+LIBYUV_API
+int ARGBToJ444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
+               int width,
+               int height) {
+  return ARGBToI444Matrix(src_argb, src_stride_argb, dst_yj, dst_stride_yj,
+                          dst_uj, dst_stride_uj, dst_vj, dst_stride_vj,
+                          &kArgbJPEGConstants, width, height);
+}
+
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
+               int width,
+               int height) {
+  return ARGBToI422Matrix(src_argb, src_stride_argb, dst_yj, dst_stride_yj,
+                          dst_uj, dst_stride_uj, dst_vj, dst_stride_vj,
+                          &kArgbJPEGConstants, width, height);
+}
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_uj,
+               int dst_stride_uj,
+               uint8_t* dst_vj,
+               int dst_stride_vj,
+               int width,
+               int height) {
+  return ARGBToI420Matrix(src_argb, src_stride_argb, dst_yj, dst_stride_yj,
+                          dst_uj, dst_stride_uj, dst_vj, dst_stride_vj,
+                          &kArgbJPEGConstants, width, height);
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height) {
+  return ARGBToI400Matrix(src_argb, src_stride_argb, dst_yj, dst_stride_yj,
+                          &kArgbJPEGConstants, width, height);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
+
+
diff --git a/source/convert_to_argb.cc b/source/convert_to_argb.cc
index 720cb0984..72d21b042 100644
--- a/source/convert_to_argb.cc
+++ b/source/convert_to_argb.cc
@@ -11,7 +11,6 @@
 #include "libyuv/convert_argb.h"
 
 #include <limits.h>
-#include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 
@@ -51,26 +50,12 @@ int ConvertToARGB(const uint8_t* sample,
                   int crop_height,
                   enum RotationMode rotation,
                   uint32_t fourcc) {
-  if (src_height == INT_MIN || crop_height == INT_MIN) {
-    return -1;
-  }
-
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
-  if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
-      src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
-      src_height == 0 || crop_height == 0 || crop_x < 0 || crop_y < 0 ||
-      crop_width > src_width || crop_x > src_width - crop_width ||
-      abs_crop_height > abs_src_height ||
-      crop_y > abs_src_height - abs_crop_height) {
-    return -1;
-  }
-
   uint32_t format = CanonicalFourCC(fourcc);
   int aligned_src_width = (src_width + 1) & ~1;
   const uint8_t* src;
   const uint8_t* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
 
   // One pass rotation is available for some formats. For the rest, convert
@@ -83,8 +68,13 @@ int ConvertToARGB(const uint8_t* sample,
   uint8_t* dest_argb = dst_argb;
   int dest_dst_stride_argb = dst_stride_argb;
   uint8_t* rotate_buffer = NULL;
-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
 
+  if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
+      src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
   if (src_height < 0) {
     inv_crop_height = -inv_crop_height;
   }
@@ -106,97 +96,95 @@ int ConvertToARGB(const uint8_t* sample,
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
       r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_UYVY:
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
       r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_24BG:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
       r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
                       inv_crop_height);
       break;
     case FOURCC_RAW:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
       r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
       break;
     case FOURCC_ARGB:
       if (!need_buf && !rotation) {
-        src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+        src = sample + (src_width * crop_y + crop_x) * 4;
         r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
                        crop_width, inv_crop_height);
       }
       break;
     case FOURCC_BGRA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_ABGR:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_RGBA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_AR30:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_AB30:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_RGBP:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
       r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                        crop_width, inv_crop_height);
       break;
     case FOURCC_RGBO:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
       r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_R444:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
       r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_I400:
-      src = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      src = sample + src_width * crop_y + crop_x;
       r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_J400:
-      src = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      src = sample + src_width * crop_y + crop_x;
       r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
       break;
 
     // Biplanar formats
     case FOURCC_NV12:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample +
-               aligned_src_width * ((ptrdiff_t)abs_src_height + crop_y / 2) +
-               crop_x;
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
       r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
                      dst_stride_argb, crop_width, inv_crop_height);
       break;
     case FOURCC_NV21:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample +
-               aligned_src_width * ((ptrdiff_t)abs_src_height + crop_y / 2) +
-               crop_x;
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
       // Call NV12 but with u and v parameters swapped.
       r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
                      dst_stride_argb, crop_width, inv_crop_height);
@@ -204,21 +192,21 @@ int ConvertToARGB(const uint8_t* sample,
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YV12: {
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
       const uint8_t* src_u;
       const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+                (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+                (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
       r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
@@ -228,12 +216,11 @@ int ConvertToARGB(const uint8_t* sample,
     case FOURCC_J420: {
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
@@ -242,12 +229,11 @@ int ConvertToARGB(const uint8_t* sample,
     case FOURCC_H420: {
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
@@ -256,12 +242,11 @@ int ConvertToARGB(const uint8_t* sample,
     case FOURCC_U420: {
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
@@ -270,19 +255,19 @@ int ConvertToARGB(const uint8_t* sample,
     case FOURCC_I422:
     case FOURCC_YV16: {
       int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
       const uint8_t* src_u;
       const uint8_t* src_v;
       if (format == FOURCC_YV16) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
       r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
@@ -291,12 +276,11 @@ int ConvertToARGB(const uint8_t* sample,
 
     case FOURCC_J422: {
       int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)abs_src_height + crop_y) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
@@ -304,12 +288,11 @@ int ConvertToARGB(const uint8_t* sample,
 
     case FOURCC_H422: {
       int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)abs_src_height + crop_y) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
@@ -317,12 +300,11 @@ int ConvertToARGB(const uint8_t* sample,
 
     case FOURCC_U422: {
       int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)abs_src_height + crop_y) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
@@ -330,19 +312,15 @@ int ConvertToARGB(const uint8_t* sample,
 
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
       const uint8_t* src_u;
       const uint8_t* src_v;
       if (format == FOURCC_YV24) {
-        src_v =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       } else {
-        src_u =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
       r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
@@ -350,36 +328,33 @@ int ConvertToARGB(const uint8_t* sample,
     }
 
     case FOURCC_J444: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u =
-          sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-      const uint8_t* src_v =
-          sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-          crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 
     case FOURCC_H444: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u =
-          sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-      const uint8_t* src_v =
-          sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-          crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 
     case FOURCC_U444: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u =
-          sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-      const uint8_t* src_v =
-          sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-          crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                      dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
@@ -402,7 +377,7 @@ int ConvertToARGB(const uint8_t* sample,
     }
     free(rotate_buffer);
   } else if (rotation) {
-    src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+    src = sample + (src_width * crop_y + crop_x) * 4;
     r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                    inv_crop_height, rotation);
   }
diff --git a/source/convert_to_i420.cc b/source/convert_to_i420.cc
index baa4a9494..aab071e1a 100644
--- a/source/convert_to_i420.cc
+++ b/source/convert_to_i420.cc
@@ -44,24 +44,12 @@ int ConvertToI420(const uint8_t* sample,
                   int crop_height,
                   enum RotationMode rotation,
                   uint32_t fourcc) {
-  if (src_height == INT_MIN || crop_height == INT_MIN) {
-    return -1;
-  }
-
-  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
-  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
-      src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
-      crop_height == 0 || crop_x < 0 || crop_y < 0 || crop_width > src_width ||
-      crop_x > src_width - crop_width || abs_crop_height > abs_src_height ||
-      crop_y > abs_src_height - abs_crop_height) {
-    return -1;
-  }
-
   uint32_t format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
   const uint8_t* src;
   const uint8_t* src_uv;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
   LIBYUV_BOOL need_buf =
       (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
@@ -76,7 +64,12 @@ int ConvertToI420(const uint8_t* sample,
   uint8_t* rotate_buffer = NULL;
   const int inv_crop_height =
       (src_height < 0) ? -abs_crop_height : abs_crop_height;
-  int aligned_src_width = (src_width + 1) & ~1;
+
+  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+      src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
+      crop_height == 0) {
+    return -1;
+  }
 
   // One pass rotation is available for some formats. For the rest, convert
   // to I420 (with optional vertical flipping) into a temporary I420 buffer,
@@ -84,14 +77,14 @@ int ConvertToI420(const uint8_t* sample,
   // For in-place conversion, if destination dst_y is same as source sample,
   // also enable temporary buffer.
   if (need_buf) {
-    size_t y_size = (size_t)crop_width * abs_crop_height;
-    size_t uv_size =
-        (size_t)((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-    if (uv_size > SIZE_MAX / 2 || y_size > SIZE_MAX - uv_size * 2) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    const uint64_t rotate_buffer_size =
+        (uint64_t)y_size + (uint64_t)uv_size * 2;
+    if (rotate_buffer_size > SIZE_MAX) {
       return -1;  // Invalid size.
     }
-    const size_t rotate_buffer_size = y_size + uv_size * 2;
-    rotate_buffer = (uint8_t*)malloc(rotate_buffer_size);
+    rotate_buffer = (uint8_t*)malloc((size_t)rotate_buffer_size);
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
@@ -109,7 +102,7 @@ int ConvertToI420(const uint8_t* sample,
       uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
       int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
       int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
       r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
                      stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
@@ -119,86 +112,84 @@ int ConvertToI420(const uint8_t* sample,
       uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
       int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
       int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
       r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
                      stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
     }
     case FOURCC_RGBP:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
       r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                        dst_stride_u, dst_v, dst_stride_v, crop_width,
                        inv_crop_height);
       break;
     case FOURCC_RGBO:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
       r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                          dst_stride_u, dst_v, dst_stride_v, crop_width,
                          inv_crop_height);
       break;
     case FOURCC_R444:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
       r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                          dst_stride_u, dst_v, dst_stride_v, crop_width,
                          inv_crop_height);
       break;
     case FOURCC_24BG:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
       r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
                       dst_stride_u, dst_v, dst_stride_v, crop_width,
                       inv_crop_height);
       break;
     case FOURCC_RAW:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
       r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
       break;
     case FOURCC_ARGB:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                      dst_stride_u, dst_v, dst_stride_v, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_BGRA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                      dst_stride_u, dst_v, dst_stride_v, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_ABGR:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                      dst_stride_u, dst_v, dst_stride_v, crop_width,
                      inv_crop_height);
       break;
     case FOURCC_RGBA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
       r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                      dst_stride_u, dst_v, dst_stride_v, crop_width,
                      inv_crop_height);
       break;
     // TODO(fbarchard): Add AR30 and AB30
     case FOURCC_I400:
-      src = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      src = sample + src_width * crop_y + crop_x;
       r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
                      dst_v, dst_stride_v, crop_width, inv_crop_height);
       break;
     // Biplanar formats
     case FOURCC_NV12:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample + ((ptrdiff_t)src_width * abs_src_height) +
-               ((ptrdiff_t)(crop_y / 2) * aligned_src_width) +
-               ((crop_x / 2) * 2);
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
       r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
                            dst_stride_y, dst_u, dst_stride_u, dst_v,
                            dst_stride_v, crop_width, inv_crop_height, rotation);
       break;
     case FOURCC_NV21:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample + ((ptrdiff_t)src_width * abs_src_height) +
-               ((ptrdiff_t)(crop_y / 2) * aligned_src_width) +
-               ((crop_x / 2) * 2);
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
       // Call NV12 but with dst_u and dst_v parameters swapped.
       r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
                            dst_stride_y, dst_v, dst_stride_v, dst_u,
@@ -207,23 +198,21 @@ int ConvertToI420(const uint8_t* sample,
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YV12: {
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
       const uint8_t* src_u;
       const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * (crop_y / 2) + (crop_x / 2);
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + (crop_y / 2)) +
+        src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
                 (crop_x / 2);
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
       } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * (crop_y / 2) + (crop_x / 2);
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + (crop_y / 2)) +
+        src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
                 (crop_x / 2);
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
       }
       r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
@@ -232,20 +221,20 @@ int ConvertToI420(const uint8_t* sample,
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
       const uint8_t* src_u;
       const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + (crop_x / 2);
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + (crop_x / 2);
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
       } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + (crop_x / 2);
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + (crop_x / 2);
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
       }
       r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                      dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
@@ -254,19 +243,15 @@ int ConvertToI420(const uint8_t* sample,
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
       const uint8_t* src_u;
       const uint8_t* src_v;
       if (format == FOURCC_YV24) {
-        src_v =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       } else {
-        src_u =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
       r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
                      dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index 0d7ea9a95..0cc46b10a 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -397,6 +397,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
   int cpu_info7[4] = {0, 0, 0, 0};
   int cpu_einfo7[4] = {0, 0, 0, 0};
   int cpu_info24[4] = {0, 0, 0, 0};
+  int cpu_info21[4] = {0, 0, 0, 0};
   int cpu_amdinfo21[4] = {0, 0, 0, 0};
   CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
@@ -405,6 +406,9 @@ static SAFEBUFFERS int GetCpuFlags(void) {
     CpuId(7, 1, cpu_einfo7);
     CpuId(0x80000021, 0, cpu_amdinfo21);
   }
+  if (cpu_info0[0] >= 0x21) {
+    CpuId(0x21, 0, cpu_info21);
+  }
   if (cpu_info0[0] >= 0x24) {
     CpuId(0x24, 0, cpu_info24);
   }
@@ -435,7 +439,8 @@ static SAFEBUFFERS int GetCpuFlags(void) {
                   ((cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0) |
                   ((cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0) |
                   ((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
-                  ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
+                  ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0) |
+                  ((cpu_info21[0] & 0x00800000) ? kCpuHasAVX512BMM : 0);
       if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
         cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
       }
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 3481d643d..3b703920c 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -11,16 +11,16 @@
 #include "libyuv/planar_functions.h"
 
 #include <assert.h>
-#include <limits.h>
 #include <string.h>  // for memset()
 
-#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"
 #include "libyuv/scale_row.h"  // for ScaleRowDown2
 
 #ifdef __cplusplus
 namespace libyuv {
+
 extern "C" {
 #endif
 
@@ -34,18 +34,17 @@ void CopyPlane(const uint8_t* src_y,
                int height) {
   int y;
   void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -123,18 +122,17 @@ void Convert16To8Plane(const uint16_t* src_y,
   void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
                           int width) = Convert16To8Row_C;
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -198,18 +196,17 @@ void Convert8To16Plane(const uint8_t* src_y,
   void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
                           int width) = Convert8To16Row_C;
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -266,18 +263,17 @@ void Convert8To8Plane(const uint8_t* src_y,
   void (*Convert8To8Row)(const uint8_t* src_y, uint8_t* dst_y, int scale,
                          int bias, int width) = Convert8To8Row_C;
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -336,16 +332,16 @@ int I422Copy(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
 
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -376,15 +372,15 @@ int I444Copy(const uint8_t* src_y,
              int width,
              int height) {
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -417,16 +413,16 @@ int I210Copy(const uint16_t* src_y,
   int halfwidth = (width + 1) >> 1;
 
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -458,15 +454,15 @@ int I410Copy(const uint16_t* src_y,
              int width,
              int height) {
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -488,13 +484,13 @@ int I400ToI400(const uint8_t* src_y,
                int dst_stride_y,
                int width,
                int height) {
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
@@ -517,13 +513,13 @@ int I420ToI400(const uint8_t* src_y,
   (void)src_stride_u;
   (void)src_v;
   (void)src_stride_v;
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
 
@@ -546,8 +542,7 @@ int NV12Copy(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
 
-  if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -555,8 +550,8 @@ int NV12Copy(const uint8_t* src_y,
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_uv = src_uv + (ptrdiff_t)(halfheight - 1) * src_stride_uv;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
     src_stride_y = -src_stride_y;
     src_stride_uv = -src_stride_uv;
   }
@@ -596,20 +591,20 @@ void SplitUVPlane(const uint8_t* src_uv,
   int y;
   void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                      int width) = SplitUVRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_u = dst_u + (ptrdiff_t)(height - 1) * dst_stride_u;
-    dst_v = dst_v + (ptrdiff_t)(height - 1) * dst_stride_v;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
     dst_stride_u = -dst_stride_u;
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
   if (src_stride_uv == width * 2 && dst_stride_u == width &&
-      dst_stride_v == width && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_v == width) {
     width *= height;
     height = 1;
     src_stride_uv = dst_stride_u = dst_stride_v = 0;
@@ -630,14 +625,6 @@ void SplitUVPlane(const uint8_t* src_uv,
     }
   }
 #endif
-#if defined(HAS_SPLITUVROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    SplitUVRow = SplitUVRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      SplitUVRow = SplitUVRow_AVX512BW;
-    }
-  }
-#endif
 #if defined(HAS_SPLITUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SplitUVRow = SplitUVRow_Any_NEON;
@@ -681,18 +668,18 @@ void MergeUVPlane(const uint8_t* src_u,
   int y;
   void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_uv, int width) = MergeUVRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_uv = dst_uv + (ptrdiff_t)(height - 1) * dst_stride_uv;
+    dst_uv = dst_uv + (height - 1) * dst_stride_uv;
     dst_stride_uv = -dst_stride_uv;
   }
   // Coalesce rows.
   if (src_stride_u == width && src_stride_v == width &&
-      dst_stride_uv == width * 2 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_uv == width * 2) {
     width *= height;
     height = 1;
     src_stride_u = src_stride_v = dst_stride_uv = 0;
@@ -773,20 +760,20 @@ void SplitUVPlane_16(const uint16_t* src_uv,
   void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
                         uint16_t* dst_v, int depth, int width) =
       SplitUVRow_16_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_u = dst_u + (ptrdiff_t)(height - 1) * dst_stride_u;
-    dst_v = dst_v + (ptrdiff_t)(height - 1) * dst_stride_v;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
     dst_stride_u = -dst_stride_u;
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
   if (src_stride_uv == width * 2 && dst_stride_u == width &&
-      dst_stride_v == width && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_v == width) {
     width *= height;
     height = 1;
     src_stride_uv = dst_stride_u = dst_stride_v = 0;
@@ -833,18 +820,18 @@ void MergeUVPlane_16(const uint16_t* src_u,
       MergeUVRow_16_C;
   assert(depth >= 8);
   assert(depth <= 16);
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_uv = dst_uv + (ptrdiff_t)(height - 1) * dst_stride_uv;
+    dst_uv = dst_uv + (height - 1) * dst_stride_uv;
     dst_stride_uv = -dst_stride_uv;
   }
   // Coalesce rows.
   if (src_stride_u == width && src_stride_v == width &&
-      dst_stride_uv == width * 2 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_uv == width * 2) {
     width *= height;
     height = 1;
     src_stride_u = src_stride_v = dst_stride_uv = 0;
@@ -893,18 +880,17 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
   int scale = 1 << (16 - depth);
   void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
                          int width) = MultiplyRow_16_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -952,18 +938,17 @@ void ConvertToLSBPlane_16(const uint16_t* src_y,
   int scale = 1 << depth;
   void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
                     int width) = DivideRow_16_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -1009,18 +994,17 @@ void SwapUVPlane(const uint8_t* src_uv,
   int y;
   void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
       SwapUVRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uv = src_uv + (ptrdiff_t)(height - 1) * src_stride_uv;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
     src_stride_uv = -src_stride_uv;
   }
   // Coalesce rows.
-  if (src_stride_uv == width * 2 && dst_stride_vu == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
     width *= height;
     height = 1;
     src_stride_uv = dst_stride_vu = 0;
@@ -1073,7 +1057,7 @@ int NV21ToNV12(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
 
-  if (!src_vu || !dst_uv || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_vu || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -1085,7 +1069,7 @@ int NV21ToNV12(const uint8_t* src_y,
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_vu = src_vu + (ptrdiff_t)(halfheight - 1) * src_stride_vu;
+    src_vu = src_vu + (halfheight - 1) * src_stride_vu;
     src_stride_vu = -src_stride_vu;
   }
 
@@ -1095,7 +1079,7 @@ int NV21ToNV12(const uint8_t* src_y,
 }
 
 // Test if tile_height is a power of 2 (16 or 32)
-#define IS_POWEROFTWO(x) (!((x) & ((x) - 1)))
+#define IS_POWEROFTWO(x) (!((x) & ((x)-1)))
 
 // Detile a plane of data
 // tile width is 16 and assumed.
@@ -1114,7 +1098,7 @@ int DetilePlane(const uint8_t* src_y,
   int y;
   void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst,
                     int width) = DetileRow_C;
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN ||
+  if (!src_y || !dst_y || width <= 0 || height == 0 ||
       !IS_POWEROFTWO(tile_height)) {
     return -1;
   }
@@ -1122,7 +1106,7 @@ int DetilePlane(const uint8_t* src_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
 
@@ -1171,7 +1155,7 @@ int DetilePlane_16(const uint16_t* src_y,
   int y;
   void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride,
                        uint16_t* dst, int width) = DetileRow_16_C;
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN ||
+  if (!src_y || !dst_y || width <= 0 || height == 0 ||
       !IS_POWEROFTWO(tile_height)) {
     return -1;
   }
@@ -1179,7 +1163,7 @@ int DetilePlane_16(const uint16_t* src_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
 
@@ -1240,15 +1224,15 @@ void DetileSplitUVPlane(const uint8_t* src_uv,
   assert(tile_height > 0);
   assert(src_stride_uv > 0);
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_u = dst_u + (ptrdiff_t)(height - 1) * dst_stride_u;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
     dst_stride_u = -dst_stride_u;
-    dst_v = dst_v + (ptrdiff_t)(height - 1) * dst_stride_v;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
     dst_stride_v = -dst_stride_v;
   }
 
@@ -1304,13 +1288,13 @@ void DetileToYUY2(const uint8_t* src_y,
   assert(src_stride_uv > 0);
   assert(tile_height > 0);
 
-  if (width <= 0 || height == 0 || height == INT_MIN || tile_height <= 0) {
+  if (width <= 0 || height == 0 || tile_height <= 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_yuy2 = dst_yuy2 + (ptrdiff_t)(height - 1) * dst_stride_yuy2;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
 
@@ -1366,23 +1350,22 @@ void SplitRGBPlane(const uint8_t* src_rgb,
   int y;
   void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
                       uint8_t* dst_b, int width) = SplitRGBRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_r = dst_r + (ptrdiff_t)(height - 1) * dst_stride_r;
-    dst_g = dst_g + (ptrdiff_t)(height - 1) * dst_stride_g;
-    dst_b = dst_b + (ptrdiff_t)(height - 1) * dst_stride_b;
+    dst_r = dst_r + (height - 1) * dst_stride_r;
+    dst_g = dst_g + (height - 1) * dst_stride_g;
+    dst_b = dst_b + (height - 1) * dst_stride_b;
     dst_stride_r = -dst_stride_r;
     dst_stride_g = -dst_stride_g;
     dst_stride_b = -dst_stride_b;
   }
   // Coalesce rows.
   if (src_stride_rgb == width * 3 && dst_stride_r == width &&
-      dst_stride_g == width && dst_stride_b == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_g == width && dst_stride_b == width) {
     width *= height;
     height = 1;
     src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
@@ -1450,19 +1433,19 @@ void MergeRGBPlane(const uint8_t* src_r,
   void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
                       const uint8_t* src_b, uint8_t* dst_rgb, int width) =
       MergeRGBRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   // Coalesce rows.
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_rgb = dst_rgb + (ptrdiff_t)(height - 1) * dst_stride_rgb;
+    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
     dst_stride_rgb = -dst_stride_rgb;
   }
   // Coalesce rows.
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      dst_stride_rgb == width * 3 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_rgb == width * 3) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
@@ -1517,14 +1500,13 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
                        uint8_t* dst_b, uint8_t* dst_a, int width) =
       SplitARGBRow_C;
 
-  assert(height >= 0);
+  assert(height > 0);
 
   if (width <= 0 || height == 0) {
     return;
   }
   if (src_stride_argb == width * 4 && dst_stride_r == width &&
-      dst_stride_g == width && dst_stride_b == width && dst_stride_a == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
@@ -1593,14 +1575,13 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
   int y;
   void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
                        uint8_t* dst_b, int width) = SplitXRGBRow_C;
-  assert(height >= 0);
+  assert(height > 0);
 
   if (width <= 0 || height == 0) {
     return;
   }
   if (src_stride_argb == width * 4 && dst_stride_r == width &&
-      dst_stride_g == width && dst_stride_b == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_g == width && dst_stride_b == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
@@ -1666,16 +1647,13 @@ void SplitARGBPlane(const uint8_t* src_argb,
                     int dst_stride_a,
                     int width,
                     int height) {
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_r = dst_r + (ptrdiff_t)(height - 1) * dst_stride_r;
-    dst_g = dst_g + (ptrdiff_t)(height - 1) * dst_stride_g;
-    dst_b = dst_b + (ptrdiff_t)(height - 1) * dst_stride_b;
-    dst_a = dst_a + (ptrdiff_t)(height - 1) * dst_stride_a;
+    dst_r = dst_r + (height - 1) * dst_stride_r;
+    dst_g = dst_g + (height - 1) * dst_stride_g;
+    dst_b = dst_b + (height - 1) * dst_stride_b;
+    dst_a = dst_a + (height - 1) * dst_stride_a;
     dst_stride_r = -dst_stride_r;
     dst_stride_g = -dst_stride_g;
     dst_stride_b = -dst_stride_b;
@@ -1710,14 +1688,13 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r,
                        const uint8_t* src_b, const uint8_t* src_a,
                        uint8_t* dst_argb, int width) = MergeARGBRow_C;
 
-  assert(height >= 0);
+  assert(height > 0);
 
   if (width <= 0 || height == 0) {
     return;
   }
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      src_stride_a == width && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_a == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = src_stride_a =
@@ -1779,13 +1756,13 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r,
                        const uint8_t* src_b, uint8_t* dst_argb, int width) =
       MergeXRGBRow_C;
 
-  assert(height >= 0);
+  assert(height > 0);
 
   if (width <= 0 || height == 0) {
     return;
   }
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
@@ -1842,13 +1819,10 @@ void MergeARGBPlane(const uint8_t* src_r,
                     int dst_stride_argb,
                     int width,
                     int height) {
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 
@@ -1881,18 +1855,15 @@ void MergeXR30Plane(const uint16_t* src_r,
                        const uint16_t* src_b, uint8_t* dst_ar30, int depth,
                        int width) = MergeXR30Row_C;
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar30 = dst_ar30 + (ptrdiff_t)(height - 1) * dst_stride_ar30;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
   // Coalesce rows.
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      dst_stride_ar30 == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_ar30 == width * 4) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0;
@@ -1950,14 +1921,8 @@ static void MergeAR64PlaneAlpha(const uint16_t* src_r,
                        uint16_t* dst_argb, int depth, int width) =
       MergeAR64Row_C;
 
-  assert(height >= 0);
-
-  if (width <= 0 || height == 0) {
-    return;
-  }
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      src_stride_a == width && dst_stride_ar64 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_a == width && dst_stride_ar64 == width * 4) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = src_stride_a =
@@ -2007,14 +1972,9 @@ static void MergeAR64PlaneOpaque(const uint16_t* src_r,
                        const uint16_t* src_b, uint16_t* dst_argb, int depth,
                        int width) = MergeXR64Row_C;
 
-  assert(height >= 0);
-
-  if (width <= 0 || height == 0) {
-    return;
-  }
   // Coalesce rows.
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      dst_stride_ar64 == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_ar64 == width * 4) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0;
@@ -2059,13 +2019,10 @@ void MergeAR64Plane(const uint16_t* src_r,
                     int width,
                     int height,
                     int depth) {
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_ar64 = dst_ar64 + (ptrdiff_t)(height - 1) * dst_stride_ar64;
+    dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64;
     dst_stride_ar64 = -dst_stride_ar64;
   }
 
@@ -2100,14 +2057,8 @@ static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r,
                             uint8_t* dst_argb, int depth, int width) =
       MergeARGB16To8Row_C;
 
-  assert(height >= 0);
-
-  if (width <= 0 || height == 0) {
-    return;
-  }
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      src_stride_a == width && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_a == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = src_stride_a =
@@ -2157,14 +2108,9 @@ static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r,
                             const uint16_t* src_b, uint8_t* dst_argb, int depth,
                             int width) = MergeXRGB16To8Row_C;
 
-  assert(height >= 0);
-
-  if (width <= 0 || height == 0) {
-    return;
-  }
   // Coalesce rows.
   if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
-      dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
@@ -2209,13 +2155,10 @@ void MergeARGB16To8Plane(const uint16_t* src_r,
                          int width,
                          int height,
                          int depth) {
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 
@@ -2247,20 +2190,19 @@ int YUY2ToI422(const uint8_t* src_yuy2,
                          uint8_t* dst_v, int width) = YUY2ToUV422Row_C;
   void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
       YUY2ToYRow_C;
-  if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_yuy2 = src_yuy2 + (ptrdiff_t)(height - 1) * src_stride_yuy2;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
   if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
       dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
-      (ptrdiff_t)width * height <= 32768) {
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -2344,20 +2286,19 @@ int UYVYToI422(const uint8_t* src_uyvy,
                          uint8_t* dst_v, int width) = UYVYToUV422Row_C;
   void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
       UYVYToYRow_C;
-  if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uyvy = src_uyvy + (ptrdiff_t)(height - 1) * src_stride_uyvy;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
   if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
       dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
-      (ptrdiff_t)width * height <= 32768) {
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -2435,18 +2376,17 @@ int YUY2ToY(const uint8_t* src_yuy2,
   int y;
   void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
       YUY2ToYRow_C;
-  if (!src_yuy2 || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_yuy2 = src_yuy2 + (ptrdiff_t)(height - 1) * src_stride_yuy2;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_y = 0;
@@ -2495,18 +2435,17 @@ int UYVYToY(const uint8_t* src_uyvy,
   int y;
   void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
       UYVYToYRow_C;
-  if (!src_uyvy || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_uyvy || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uyvy = src_uyvy + (ptrdiff_t)(height - 1) * src_stride_uyvy;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_uyvy == width * 2 && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_y = 0;
@@ -2563,13 +2502,10 @@ void MirrorPlane(const uint8_t* src_y,
                  int height) {
   int y;
   void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
 #if defined(HAS_MIRRORROW_NEON)
@@ -2596,14 +2532,6 @@ void MirrorPlane(const uint8_t* src_y,
     }
   }
 #endif
-#if defined(HAS_MIRRORROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    MirrorRow = MirrorRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      MirrorRow = MirrorRow_AVX512BW;
-    }
-  }
-#endif
 #if defined(HAS_MIRRORROW_LSX)
   if (TestCpuFlag(kCpuHasLSX)) {
     MirrorRow = MirrorRow_Any_LSX;
@@ -2640,13 +2568,10 @@ void MirrorUVPlane(const uint8_t* src_uv,
   int y;
   void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
       MirrorUVRow_C;
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uv = src_uv + (ptrdiff_t)(height - 1) * src_stride_uv;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
     src_stride_uv = -src_stride_uv;
   }
 #if defined(HAS_MIRRORUVROW_NEON)
@@ -2659,6 +2584,7 @@ void MirrorUVPlane(const uint8_t* src_uv,
 #endif
 #if defined(HAS_MIRRORUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorUVRow = MirrorUVRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       MirrorUVRow = MirrorUVRow_SSSE3;
     }
@@ -2705,13 +2631,13 @@ int I400Mirror(const uint8_t* src_y,
                int dst_stride_y,
                int width,
                int height) {
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
 
@@ -2739,7 +2665,7 @@ int I420Mirror(const uint8_t* src_y,
   int halfheight = (height + 1) >> 1;
 
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
 
@@ -2747,9 +2673,9 @@ int I420Mirror(const uint8_t* src_y,
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -2778,8 +2704,7 @@ int NV12Mirror(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
 
-  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if ((!src_y && dst_y) || !src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -2787,8 +2712,8 @@ int NV12Mirror(const uint8_t* src_y,
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_uv = src_uv + (ptrdiff_t)(halfheight - 1) * src_stride_uv;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
     src_stride_y = -src_stride_y;
     src_stride_uv = -src_stride_uv;
   }
@@ -2812,14 +2737,13 @@ int ARGBMirror(const uint8_t* src_argb,
   int y;
   void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
       ARGBMirrorRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 #if defined(HAS_ARGBMIRRORROW_NEON)
@@ -2883,14 +2807,13 @@ int RGB24Mirror(const uint8_t* src_rgb24,
   int y;
   void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
       RGB24MirrorRow_C;
-  if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_rgb24 = src_rgb24 + (ptrdiff_t)(height - 1) * src_stride_rgb24;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
     src_stride_rgb24 = -src_stride_rgb24;
   }
 #if defined(HAS_RGB24MIRRORROW_NEON)
@@ -2901,11 +2824,11 @@ int RGB24Mirror(const uint8_t* src_rgb24,
     }
   }
 #endif
-#if defined(HAS_RGB24MIRRORROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    RGB24MirrorRow = RGB24MirrorRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
-      RGB24MirrorRow = RGB24MirrorRow_AVX2;
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24MirrorRow = RGB24MirrorRow_SSSE3;
     }
   }
 #endif
@@ -2932,19 +2855,18 @@ int ARGBBlend(const uint8_t* src_argb0,
   int y;
   void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
                        uint8_t* dst_argb, int width) = ARGBBlendRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
   if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
@@ -2994,21 +2916,19 @@ int BlendPlane(const uint8_t* src_y0,
   void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
                         const uint8_t* alpha, uint8_t* dst, int width) =
       BlendPlaneRow_C;
-  if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
 
   // Coalesce rows for Y plane.
   if (src_stride_y0 == width && src_stride_y1 == width &&
-      alpha_stride == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      alpha_stride == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
@@ -3081,15 +3001,14 @@ int I420Blend(const uint8_t* src_y0,
                         uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
 
   if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
-      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
 
@@ -3171,7 +3090,7 @@ int I420Blend(const uint8_t* src_y0,
     }
     // Subsample 2 rows of UV to half width and half height.
     ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth);
-    alpha += (ptrdiff_t)alpha_stride * 2;
+    alpha += alpha_stride * 2;
     BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth);
     BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth);
     src_u0 += src_stride_u0;
@@ -3198,19 +3117,18 @@ int ARGBMultiply(const uint8_t* src_argb0,
   int y;
   void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1,
                           uint8_t* dst, int width) = ARGBMultiplyRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
   if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
@@ -3284,19 +3202,18 @@ int ARGBAdd(const uint8_t* src_argb0,
   int y;
   void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
                      int width) = ARGBAddRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
   if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
@@ -3370,19 +3287,18 @@ int ARGBSubtract(const uint8_t* src_argb0,
   int y;
   void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1,
                           uint8_t* dst, int width) = ARGBSubtractRow_C;
-  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
   if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
-      dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+      dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
@@ -3449,19 +3365,17 @@ int RAWToRGB24(const uint8_t* src_raw,
   int y;
   void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) =
       RAWToRGB24Row_C;
-  if (!src_raw || !dst_rgb24 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_raw = src_raw + (ptrdiff_t)(height - 1) * src_stride_raw;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_rgb24 = 0;
@@ -3519,16 +3433,16 @@ void SetPlane(uint8_t* dst_y,
   int y;
   void (*SetRow)(uint8_t* dst, uint8_t value, int width) = SetRow_C;
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
+  if (width <= 0 || height == 0) {
     return;
   }
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (ptrdiff_t)(height - 1) * dst_stride_y;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (dst_stride_y == width && (ptrdiff_t)width * height <= INT_MAX) {
+  if (dst_stride_y == width) {
     width *= height;
     height = 1;
     dst_stride_y = 0;
@@ -3591,9 +3505,9 @@ int I420Rect(uint8_t* dst_y,
   uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
   uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
 
-  if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN || x < 0 || y < 0 || value_y < 0 || value_y > 255 ||
-      value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) {
+  if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+      y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
     return -1;
   }
 
@@ -3615,18 +3529,17 @@ int ARGBRect(uint8_t* dst_argb,
   int y;
   void (*ARGBSetRow)(uint8_t* dst_argb, uint32_t value, int width) =
       ARGBSetRow_C;
-  if (!dst_argb || width <= 0 || height == 0 || height == INT_MIN ||
-      dst_x < 0 || dst_y < 0) {
+  if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (ptrdiff_t)(height - 1) * dst_stride_argb;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
   dst_argb += dst_y * dst_stride_argb + dst_x * 4;
   // Coalesce rows.
-  if (dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     dst_stride_argb = 0;
@@ -3685,18 +3598,16 @@ int ARGBAttenuate(const uint8_t* src_argb,
   int y;
   void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                            int width) = ARGBAttenuateRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -3766,18 +3677,16 @@ int ARGBUnattenuate(const uint8_t* src_argb,
   int y;
   void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -3819,18 +3728,16 @@ int ARGBGrayTo(const uint8_t* src_argb,
   int y;
   void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
       ARGBGrayRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -3885,7 +3792,7 @@ int ARGBGray(uint8_t* dst_argb,
     return -1;
   }
   // Coalesce rows.
-  if (dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     dst_stride_argb = 0;
@@ -3938,7 +3845,7 @@ int ARGBSepia(uint8_t* dst_argb,
     return -1;
   }
   // Coalesce rows.
-  if (dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     dst_stride_argb = 0;
@@ -3990,18 +3897,16 @@ int ARGBColorMatrix(const uint8_t* src_argb,
   void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                              const int8_t* matrix_argb, int width) =
       ARGBColorMatrixRow_C;
-  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -4090,7 +3995,7 @@ int ARGBColorTable(uint8_t* dst_argb,
     return -1;
   }
   // Coalesce rows.
-  if (dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     dst_stride_argb = 0;
@@ -4126,7 +4031,7 @@ int RGBColorTable(uint8_t* dst_argb,
     return -1;
   }
   // Coalesce rows.
-  if (dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     dst_stride_argb = 0;
@@ -4171,7 +4076,7 @@ int ARGBQuantize(uint8_t* dst_argb,
     return -1;
   }
   // Coalesce rows.
-  if (dst_stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     dst_stride_argb = 0;
@@ -4256,13 +4161,12 @@ int ARGBBlur(const uint8_t* src_argb,
   int32_t* max_cumsum_bot_row;
   int32_t* cumsum_top_row;
 
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   if (radius > height) {
@@ -4357,18 +4261,16 @@ int ARGBShade(const uint8_t* src_argb,
   int y;
   void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width,
                        uint32_t value) = ARGBShadeRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN || value == 0u) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
     return -1;
   }
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -4417,23 +4319,29 @@ int InterpolatePlane(const uint8_t* src0,
   void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src0 || !src1 || !dst || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst = dst + (ptrdiff_t)(height - 1) * dst_stride;
+    dst = dst + (height - 1) * dst_stride;
     dst_stride = -dst_stride;
   }
   // Coalesce rows.
-  if (src_stride0 == width && src_stride1 == width && dst_stride == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
     width *= height;
     height = 1;
     src_stride0 = src_stride1 = dst_stride = 0;
   }
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -4493,19 +4401,17 @@ int InterpolatePlane_16(const uint16_t* src0,
   void (*InterpolateRow_16)(uint16_t* dst_ptr, const uint16_t* src_ptr,
                             ptrdiff_t src_stride, int dst_width,
                             int source_y_fraction) = InterpolateRow_16_C;
-  if (!src0 || !src1 || !dst || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst = dst + (ptrdiff_t)(height - 1) * dst_stride;
+    dst = dst + (height - 1) * dst_stride;
     dst_stride = -dst_stride;
   }
   // Coalesce rows.
-  if (src_stride0 == width && src_stride1 == width && dst_stride == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
     width *= height;
     height = 1;
     src_stride0 = src_stride1 = dst_stride = 0;
@@ -4600,8 +4506,7 @@ int I420Interpolate(const uint8_t* src0_y,
   int halfheight = (height + 1) >> 1;
 
   if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
-      !dst_y || !dst_u || !dst_v || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+      !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
 
@@ -4626,19 +4531,17 @@ int ARGBShuffle(const uint8_t* src_argb,
   int y;
   void (*ARGBShuffleRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                          const uint8_t* shuffler, int width) = ARGBShuffleRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -4659,14 +4562,6 @@ int ARGBShuffle(const uint8_t* src_argb,
     }
   }
 #endif
-#if defined(HAS_ARGBSHUFFLEROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBShuffleRow = ARGBShuffleRow_AVX512BW;
-    }
-  }
-#endif
 #if defined(HAS_ARGBSHUFFLEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
@@ -4712,19 +4607,17 @@ int AR64Shuffle(const uint16_t* src_ar64,
   int y;
   void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64,
                          const uint8_t* shuffler, int width) = AR64ShuffleRow_C;
-  if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_ar64 = src_ar64 + (ptrdiff_t)(height - 1) * src_stride_ar64;
+    src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
     src_stride_ar64 = -src_stride_ar64;
   }
   // Coalesce rows.
-  if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) {
     width *= height;
     height = 1;
     src_stride_ar64 = dst_stride_ar64 = 0;
@@ -4746,14 +4639,6 @@ int AR64Shuffle(const uint16_t* src_ar64,
     }
   }
 #endif
-#if defined(HAS_ARGBSHUFFLEROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    AR64ShuffleRow = ARGBShuffleRow_Any_AVX512BW;
-    if (IS_ALIGNED(width, 16)) {
-      AR64ShuffleRow = ARGBShuffleRow_AVX512BW;
-    }
-  }
-#endif
 #if defined(HAS_ARGBSHUFFLEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     AR64ShuffleRow = ARGBShuffleRow_Any_NEON;
@@ -4791,13 +4676,13 @@ int GaussPlane_F32(const float* src,
                        int width) = GaussCol_F32_C;
   void (*GaussRow_F32)(const float* src, float* dst, int width) =
       GaussRow_F32_C;
-  if (!src || !dst || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src || !dst || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src = src + (ptrdiff_t)(height - 1) * src_stride;
+    src = src + (height - 1) * src_stride;
     src_stride = -src_stride;
   }
 
@@ -4860,76 +4745,83 @@ static int ARGBSobelize(const uint8_t* src_argb,
                                          uint8_t* dst,
                                          int width)) {
   int y;
-  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
-      ARGBToYJRow_C;
+  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
   void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
                     uint8_t* dst_sobely, int width) = SobelYRow_C;
   void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
                     const uint8_t* src_y2, uint8_t* dst_sobely, int width) =
       SobelXRow_C;
   const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
-#if defined(HAS_ARGBTOYJROW_SSSE3)
+#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_SSSE3;
+      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
+#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_AVX2;
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_AVX512BW)
+#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
   if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
     if (IS_ALIGNED(width, 64)) {
-      ARGBToYJRow = ARGBToYJRow_AVX512BW;
+      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_NEON)
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYJRow = ARGBToYJRow_Any_NEON;
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_NEON;
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_LSX)
+#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_LSX)
   if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYJRow = ARGBToYJRow_Any_LSX;
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYJRow = ARGBToYJRow_LSX;
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_LASX)
+#if defined(HAS_ARGBTOYMATRIXROW_LASX)
   if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYJRow = ARGBToYJRow_Any_LASX;
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
     if (IS_ALIGNED(width, 32)) {
-      ARGBToYJRow = ARGBToYJRow_LASX;
+      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYJROW_RVV)
+#if defined(HAS_ARGBTOYMATRIXROW_RVV)
   if (TestCpuFlag(kCpuHasRVV)) {
-    ARGBToYJRow = ARGBToYJRow_RVV;
+    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
   }
 #endif
 
@@ -4967,10 +4859,10 @@ static int ARGBSobelize(const uint8_t* src_argb,
     uint8_t* row_y2 = row_y1 + row_size;
     if (!rows)
       return 1;
-    ARGBToYJRow(src_argb, row_y0, width);
+    ARGBToYMatrixRow(src_argb, row_y0, width, &kArgbJPEGConstants);
     row_y0[-1] = row_y0[0];
     memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
-    ARGBToYJRow(src_argb, row_y1, width);
+    ARGBToYMatrixRow(src_argb, row_y1, width, &kArgbJPEGConstants);
     row_y1[-1] = row_y1[0];
     memset(row_y1 + width, row_y1[width - 1], 16);
     memset(row_y2 + width, 0, 16);
@@ -4980,7 +4872,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
       if (y < (height - 1)) {
         src_argb += src_stride_argb;
       }
-      ARGBToYJRow(src_argb, row_y2, width);
+      ARGBToYMatrixRow(src_argb, row_y2, width, &kArgbJPEGConstants);
       row_y2[-1] = row_y2[0];
       row_y2[width] = row_y2[width - 1];
 
@@ -5130,19 +5022,17 @@ int ARGBPolynomial(const uint8_t* src_argb,
   int y;
   void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                             const float* poly, int width) = ARGBPolynomialRow_C;
-  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -5180,7 +5070,7 @@ int HalfFloatPlane(const uint16_t* src_y,
   int y;
   void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale,
                        int width) = HalfFloatRow_C;
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   src_stride_y >>= 1;
@@ -5188,12 +5078,11 @@ int HalfFloatPlane(const uint16_t* src_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_y == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -5287,19 +5176,17 @@ int ARGBLumaColorTable(const uint8_t* src_argb,
   void (*ARGBLumaColorTableRow)(
       const uint8_t* src_argb, uint8_t* dst_argb, int width,
       const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C;
-  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -5329,19 +5216,17 @@ int ARGBCopyAlpha(const uint8_t* src_argb,
   int y;
   void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                            int width) = ARGBCopyAlphaRow_C;
-  if (!src_argb || !dst_argb || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -5379,18 +5264,17 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
                      int dst_stride_a,
                      int width,
                      int height) {
-  if (!src_argb || !dst_a || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_argb || !dst_a || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb += (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb += (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 && dst_stride_a == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_argb == width * 4 && dst_stride_a == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_a = 0;
@@ -5446,18 +5330,17 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
   int y;
   void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb,
                               int width) = ARGBCopyYToAlphaRow_C;
-  if (!src_y || !dst_argb || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
+    src_y = src_y + (height - 1) * src_stride_y;
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width && dst_stride_argb == width * 4 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -5506,15 +5389,14 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
       YUY2ToYRow_C;
   void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2,
                         uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C;
-  if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_yuy2 = src_yuy2 + (ptrdiff_t)(height - 1) * src_stride_yuy2;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
 #if defined(HAS_YUY2TOYROW_SSE2)
@@ -5615,15 +5497,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
 
-  if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uyvy = src_uyvy + (ptrdiff_t)(height - 1) * src_stride_uyvy;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
 #if defined(HAS_SPLITUVROW_SSE2)
@@ -5664,6 +5545,14 @@ int UYVYToNV12(const uint8_t* src_uyvy,
   }
 #endif
 
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -5742,14 +5631,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
                          const uint8_t* src_v, int src_stride_v,
                          uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
 
-  if (width <= 0 || height == 0 || height == INT_MIN) {
-    return;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
   }
diff --git a/source/rotate.cc b/source/rotate.cc
index 60940f51f..d4a9fcd27 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -8,10 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/rotate.h"
-
 #include <assert.h>
-#include <limits.h>
+
+#include "libyuv/rotate.h"
 
 #include "libyuv/convert.h"
 #include "libyuv/cpu_id.h"
@@ -129,7 +128,7 @@ void RotatePlane90(const uint8_t* src,
   // Rotate by 90 is a transpose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
-  src += (ptrdiff_t)src_stride * (height - 1);
+  src += src_stride * (height - 1);
   src_stride = -src_stride;
   TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }
@@ -144,7 +143,7 @@ void RotatePlane270(const uint8_t* src,
   // Rotate by 270 is a transpose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
-  dst += (ptrdiff_t)dst_stride * (width - 1);
+  dst += dst_stride * (width - 1);
   dst_stride = -dst_stride;
   TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }
@@ -161,8 +160,8 @@ void RotatePlane180(const uint8_t* src,
   assert(row);
   if (!row)
     return;
-  const uint8_t* src_bot = src + (ptrdiff_t)src_stride * (height - 1);
-  uint8_t* dst_bot = dst + (ptrdiff_t)dst_stride * (height - 1);
+  const uint8_t* src_bot = src + src_stride * (height - 1);
+  uint8_t* dst_bot = dst + dst_stride * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
   void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
@@ -355,7 +354,7 @@ void SplitRotateUV90(const uint8_t* src,
                      int dst_stride_b,
                      int width,
                      int height) {
-  src += (ptrdiff_t)src_stride * (height - 1);
+  src += src_stride * (height - 1);
   src_stride = -src_stride;
 
   SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
@@ -398,14 +397,9 @@ void SplitRotateUV180(const uint8_t* src,
     MirrorSplitUVRow = MirrorSplitUVRow_NEON;
   }
 #endif
-#if defined(HAS_MIRRORSPLITUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    MirrorSplitUVRow = MirrorSplitUVRow_AVX2;
-  }
-#endif
-#if defined(HAS_MIRRORSPLITUVROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW) && IS_ALIGNED(width, 32)) {
-    MirrorSplitUVRow = MirrorSplitUVRow_AVX512BW;
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
   }
 #endif
 #if defined(HAS_MIRRORSPLITUVROW_LSX)
@@ -437,15 +431,14 @@ int SplitRotateUV(const uint8_t* src_uv,
                   int width,
                   int height,
                   enum RotationMode mode) {
-  if (!src_uv || width <= 0 || height == 0 || height == INT_MIN || !dst_u ||
-      !dst_v) {
+  if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uv = src_uv + (ptrdiff_t)(height - 1) * src_stride_uv;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
     src_stride_uv = -src_stride_uv;
   }
 
@@ -480,14 +473,14 @@ int RotatePlane(const uint8_t* src,
                 int width,
                 int height,
                 enum RotationMode mode) {
-  if (!src || width <= 0 || height == 0 || height == INT_MIN || !dst) {
+  if (!src || width <= 0 || height == 0 || !dst) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src = src + (ptrdiff_t)(height - 1) * src_stride;
+    src = src + (height - 1) * src_stride;
     src_stride = -src_stride;
   }
 
@@ -540,7 +533,7 @@ static void RotatePlane90_16(const uint16_t* src,
   // Rotate by 90 is a transpose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
-  src += (ptrdiff_t)src_stride * (height - 1);
+  src += src_stride * (height - 1);
   src_stride = -src_stride;
   TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
 }
@@ -554,7 +547,7 @@ static void RotatePlane270_16(const uint16_t* src,
   // Rotate by 270 is a transpose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
-  dst += (ptrdiff_t)dst_stride * (width - 1);
+  dst += dst_stride * (width - 1);
   dst_stride = -dst_stride;
   TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
 }
@@ -565,8 +558,8 @@ static void RotatePlane180_16(const uint16_t* src,
                               int dst_stride,
                               int width,
                               int height) {
-  const uint16_t* src_bot = src + (ptrdiff_t)src_stride * (height - 1);
-  uint16_t* dst_bot = dst + (ptrdiff_t)dst_stride * (height - 1);
+  const uint16_t* src_bot = src + src_stride * (height - 1);
+  uint16_t* dst_bot = dst + dst_stride * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
 
@@ -598,14 +591,14 @@ int RotatePlane_16(const uint16_t* src,
                    int width,
                    int height,
                    enum RotationMode mode) {
-  if (!src || width <= 0 || height == 0 || height == INT_MIN || !dst) {
+  if (!src || width <= 0 || height == 0 || !dst) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src = src + (ptrdiff_t)(height - 1) * src_stride;
+    src = src + (height - 1) * src_stride;
     src_stride = -src_stride;
   }
 
@@ -648,7 +641,7 @@ int I420Rotate(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+      !dst_y || !dst_u || !dst_v) {
     return -1;
   }
 
@@ -656,9 +649,9 @@ int I420Rotate(const uint8_t* src_y,
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -718,16 +711,16 @@ int I422Rotate(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   int r;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -813,17 +806,17 @@ int I444Rotate(const uint8_t* src_y,
                int width,
                int height,
                enum RotationMode mode) {
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -873,8 +866,8 @@ int NV12ToI420Rotate(const uint8_t* src_y,
                      enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_uv || width <= 0 || height == 0 || height == INT_MIN ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+      !dst_v) {
     return -1;
   }
 
@@ -882,8 +875,8 @@ int NV12ToI420Rotate(const uint8_t* src_y,
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_uv = src_uv + (ptrdiff_t)(halfheight - 1) * src_stride_uv;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
     src_stride_y = -src_stride_y;
     src_stride_uv = -src_stride_uv;
   }
@@ -950,16 +943,16 @@ int Android420ToI420Rotate(const uint8_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -1025,16 +1018,16 @@ int I010Rotate(const uint16_t* src_y,
                enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_stride_y < 0) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v || dst_stride_y < 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -1096,16 +1089,16 @@ int I210Rotate(const uint16_t* src_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
   int r;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -1193,16 +1186,16 @@ int I410Rotate(const uint16_t* src_y,
                int width,
                int height,
                enum RotationMode mode) {
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_stride_y < 0) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v || dst_stride_y < 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 8cfaed034..8c76ca919 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -10,8 +10,6 @@
 
 #include "libyuv/rotate_argb.h"
 
-#include <limits.h>
-
 #include "libyuv/convert.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
@@ -224,15 +222,14 @@ int ARGBRotate(const uint8_t* src_argb,
                int width,
                int height,
                enum RotationMode mode) {
-  if (!src_argb || width <= 0 || height == 0 || height == INT_MIN ||
-      !dst_argb) {
+  if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
     return -1;
   }
 
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
diff --git a/source/rotate_common.cc b/source/rotate_common.cc
index 899405651..e0341fec4 100644
--- a/source/rotate_common.cc
+++ b/source/rotate_common.cc
@@ -191,10 +191,10 @@ void Transpose4x4_32_C(const uint8_t* src,
     ((uint32_t*)(dst3))[1] = p31;
     ((uint32_t*)(dst3))[2] = p32;
     ((uint32_t*)(dst3))[3] = p33;
-    src += (ptrdiff_t)src_stride * 4;  // advance 4 rows
-    src1 += (ptrdiff_t)src_stride * 4;
-    src2 += (ptrdiff_t)src_stride * 4;
-    src3 += (ptrdiff_t)src_stride * 4;
+    src += src_stride * 4;  // advance 4 rows
+    src1 += src_stride * 4;
+    src2 += src_stride * 4;
+    src3 += src_stride * 4;
     dst += 4 * 4;  // advance 4 columns
     dst1 += 4 * 4;
     dst2 += 4 * 4;
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index de14c41b0..27bd2251b 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -198,16 +198,16 @@ void Transpose4x4_32_NEON(const uint8_t* src,
       "vst1.8      {q3}, [%7]!                   \n"
       "bgt         1b                            \n"
 
-      : "+r"(src),                      // %0
-        "+r"(src1),                     // %1
-        "+r"(src2),                     // %2
-        "+r"(src3),                     // %3
-        "+r"(dst),                      // %4
-        "+r"(dst1),                     // %5
-        "+r"(dst2),                     // %6
-        "+r"(dst3),                     // %7
-        "+r"(width)                     // %8
-      : "r"((ptrdiff_t)src_stride * 4)  // %9
+      : "+r"(src),                        // %0
+        "+r"(src1),                       // %1
+        "+r"(src2),                       // %2
+        "+r"(src3),                       // %3
+        "+r"(dst),                        // %4
+        "+r"(dst1),                       // %5
+        "+r"(dst2),                       // %6
+        "+r"(dst3),                       // %7
+        "+r"(width)                       // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
       : "memory", "cc", "q0", "q1", "q2", "q3");
 }
 
diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc
index 14f31d94c..e09bcb178 100644
--- a/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -252,16 +252,16 @@ void Transpose4x4_32_NEON(const uint8_t* src,
       "st1         {v2.4s}, [%6], 16             \n"
       "st1         {v3.4s}, [%7], 16             \n"
       "b.gt        1b                            \n"
-      : "+r"(src),                      // %0
-        "+r"(src1),                     // %1
-        "+r"(src2),                     // %2
-        "+r"(src3),                     // %3
-        "+r"(dst),                      // %4
-        "+r"(dst1),                     // %5
-        "+r"(dst2),                     // %6
-        "+r"(dst3),                     // %7
-        "+r"(width)                     // %8
-      : "r"((ptrdiff_t)src_stride * 4)  // %9
+      : "+r"(src),                        // %0
+        "+r"(src1),                       // %1
+        "+r"(src2),                       // %2
+        "+r"(src3),                       // %3
+        "+r"(dst),                        // %4
+        "+r"(dst1),                       // %5
+        "+r"(dst2),                       // %6
+        "+r"(dst3),                       // %7
+        "+r"(width)                       // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
       : "memory", "cc", "v0", "v1", "v2", "v3");
 }
 
diff --git a/source/rotate_win.cc b/source/rotate_win.cc
index 5b40f62a0..03eeee3a6 100644
--- a/source/rotate_win.cc
+++ b/source/rotate_win.cc
@@ -64,7 +64,7 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
     mov       eax, ebp
     movdqa    xmm7, xmm6
     palignr   xmm7, xmm7, 8
-     // Second round of bit swap.
+    // Second round of bit swap.
     punpcklwd xmm0, xmm2
     punpcklwd xmm1, xmm3
     movdqa    xmm2, xmm0
@@ -77,8 +77,8 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
     movdqa    xmm7, xmm5
     palignr   xmm6, xmm6, 8
     palignr   xmm7, xmm7, 8
-     // Third round of bit swap.
-     // Write to the destination pointer.
+    // Third round of bit swap.
+    // Write to the destination pointer.
     punpckldq xmm0, xmm4
     movq      qword ptr [edx], xmm0
     movdqa    xmm4, xmm0
@@ -173,7 +173,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
     movdqa    xmm7, xmm5
     lea       eax, [eax + 8 * edi + 16]
     neg       edi
-         // Second round of bit swap.
+        // Second round of bit swap.
     movdqa    xmm5, xmm0
     punpcklwd xmm0, xmm2
     punpckhwd xmm5, xmm2
@@ -193,8 +193,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
     punpckhwd xmm6, xmm7
     movdqa    xmm7, xmm6
 
-         // Third round of bit swap.
-         // Write to the destination pointer.
+        // Third round of bit swap.
+        // Write to the destination pointer.
     movdqa    xmm6, xmm0
     punpckldq xmm0, xmm4
     punpckhdq xmm6, xmm4
diff --git a/source/row_any.cc b/source/row_any.cc
index 919b231e6..4ae858560 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/row.h"
 
+#include <stddef.h>
 #include <string.h>  // For memset.
 
 #include "libyuv/basic_types.h"
@@ -387,12 +388,6 @@ ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15)
 #ifdef HAS_I422TORGB24ROW_AVX2
 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
 #endif
-#ifdef HAS_I422TORGB24ROW_AVX512VBMI
-ANY31C(I422ToRGB24Row_Any_AVX512VBMI, I422ToRGB24Row_AVX512VBMI, 1, 0, 3, 31)
-#endif
-#ifdef HAS_I422TORGB24ROW_AVX512BW
-ANY31C(I422ToRGB24Row_Any_AVX512BW, I422ToRGB24Row_AVX512BW, 1, 0, 3, 31)
-#endif
 #ifdef HAS_I422TOARGBROW_AVX2
 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
 #endif
@@ -951,7 +946,9 @@ ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
 #if defined(HAS_ARGBTORGB24ROW_SSSE3)
 ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
 ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
-
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
 #endif
 #if defined(HAS_ARGBTORGB24ROW_AVX2)
 ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
@@ -987,9 +984,8 @@ ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
 #if defined(HAS_ARGBTOAR30ROW_AVX2)
 ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
 #endif
-
-#if defined(HAS_J400TOARGBROW_AVX512BW)
-ANY11(J400ToARGBRow_Any_AVX512BW, J400ToARGBRow_AVX512BW, 0, 1, 4, 31)
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
 #endif
 #if defined(HAS_J400TOARGBROW_AVX2)
 ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
@@ -997,14 +993,13 @@ ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
 ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
 ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
-
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
 #endif
 #if defined(HAS_RAWTOARGBROW_AVX2)
 ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31)
 #endif
-#if defined(HAS_RGB24TOARGBROW_AVX2)
-ANY11(RGB24ToARGBRow_Any_AVX2, RGB24ToARGBRow_AVX2, 0, 3, 4, 31)
-#endif
 #if defined(HAS_RAWTOARGBROW_AVX512BW)
 ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63)
 #endif
@@ -1420,8 +1415,8 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
 // Any 1 to 1 with parameter.
 #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
   void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
-    SIMD_ALIGNED(uint8_t vin[(MASK + 1) * SBPP]);                              \
-    SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]);                              \
+    SIMD_ALIGNED(uint8_t vin[64]);                                             \
+    SIMD_ALIGNED(uint8_t vout[64]);                                            \
     memset(vin, 0, sizeof(vin)); /* for msan */                                \
     int r = width & MASK;                                                      \
     int n = width & ~MASK;                                                     \
@@ -1467,6 +1462,14 @@ ANY11P(I400ToARGBRow_Any_LSX,
        15)
 #endif
 
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+       ARGBToRGB565DitherRow_SSE2,
+       const uint32_t,
+       4,
+       2,
+       3)
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
 ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
        ARGBToRGB565DitherRow_AVX2,
@@ -1505,14 +1508,6 @@ ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
 ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
 #endif
-#ifdef HAS_ARGBSHUFFLEROW_AVX512BW
-ANY11P(ARGBShuffleRow_Any_AVX512BW,
-       ARGBShuffleRow_AVX512BW,
-       const uint8_t*,
-       4,
-       4,
-       31)
-#endif
 #ifdef HAS_ARGBSHUFFLEROW_NEON
 ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
 #endif
@@ -1835,9 +1830,18 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
     memcpy(dst_ptr + np * BPP, vout, r * BPP * sizeof(TD));          \
   }
 
-#if defined(HAS_INTERPOLATEROW_AVX2)
+#ifdef HAS_INTERPOLATEROW_AVX2
 ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
 #endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11I(InterpolateRow_Any_SSSE3,
+       InterpolateRow_SSSE3,
+       uint8_t,
+       uint8_t,
+       1,
+       1,
+       15)
+#endif
 #ifdef HAS_INTERPOLATEROW_NEON
 ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
 #endif
@@ -1854,15 +1858,6 @@ ANY11I(InterpolateRow_16_Any_NEON,
        1,
        7)
 #endif
-#ifdef HAS_INTERPOLATEROW_16_AVX2
-ANY11I(InterpolateRow_16_Any_AVX2,
-       InterpolateRow_16_AVX2,
-       uint16_t,
-       uint16_t,
-       1,
-       1,
-       15)
-#endif
 #undef ANY11I
 
 // Any 1 to 1 interpolate with scale param
@@ -1911,8 +1906,8 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
 // Any 1 to 1 mirror.
 #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                          \
   void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
-    SIMD_ALIGNED(uint8_t vin[128]);                                   \
-    SIMD_ALIGNED(uint8_t vout[128]);                                  \
+    SIMD_ALIGNED(uint8_t vin[64]);                                    \
+    SIMD_ALIGNED(uint8_t vout[64]);                                   \
     memset(vin, 0, sizeof(vin)); /* for msan */                       \
     int r = width & MASK;                                             \
     int n = width & ~MASK;                                            \
@@ -1920,14 +1915,11 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
       ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                        \
     }                                                                 \
     ptrdiff_t np = n;                                                 \
-    memcpy(vin, src_ptr, r * BPP);                                    \
+    memcpy(vin, src_ptr, r* BPP);                                     \
     ANY_SIMD(vin, vout, MASK + 1);                                    \
     memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
   }
 
-#ifdef HAS_MIRRORROW_AVX512BW
-ANY11M(MirrorRow_Any_AVX512BW, MirrorRow_AVX512BW, 1, 63)
-#endif
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
 #endif
@@ -1946,6 +1938,9 @@ ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63)
 #ifdef HAS_MIRRORUVROW_AVX2
 ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
 #endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
 #ifdef HAS_MIRRORUVROW_NEON
 ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
 #endif
@@ -1970,8 +1965,8 @@ ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7)
 #ifdef HAS_ARGBMIRRORROW_LASX
 ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
 #endif
-#ifdef HAS_RGB24MIRRORROW_AVX2
-ANY11M(RGB24MirrorRow_Any_AVX2, RGB24MirrorRow_AVX2, 3, 31)
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
 #endif
 #ifdef HAS_RGB24MIRRORROW_NEON
 ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
@@ -2031,9 +2026,6 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3)
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
 #endif
-#ifdef HAS_SPLITUVROW_AVX512BW
-ANY12(SplitUVRow_Any_AVX512BW, SplitUVRow_AVX512BW, 0, 2, 0, 63)
-#endif
 #ifdef HAS_SPLITUVROW_AVX2
 ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
 #endif
@@ -2205,7 +2197,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
                uint8_t* dst_v, int width) {                                  \
     SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
     SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
-    memset(vin, 0, sizeof(vin));   /* for msan */                            \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
     memset(vout, 0, sizeof(vout)); /* for msan */                            \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -2227,29 +2219,29 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
     memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1));                         \
   }
 
-#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK)                           \
-  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
-               int width, const struct ArgbConstants* c) {             \
-    SIMD_ALIGNED(uint8_t vin[256]);                                    \
-    SIMD_ALIGNED(uint8_t vout[256 * 2]);                               \
-    memset(vin, 0, sizeof(vin)); /* for msan */                        \
-    int r = width & MASK;                                              \
-    int n = width & ~MASK;                                             \
-    if (n > 0) {                                                       \
-      ANY_SIMD(src_ptr, dst_u, dst_v, n, c);                           \
-    }                                                                  \
-    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);     \
-    ANY_SIMD(vin, vout, vout + 256, MASK + 1, c);                      \
-    memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r);                  \
-    memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r);            \
+#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK)                                 \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v,        \
+               int width, const struct ArgbConstants* c) {                   \
+    SIMD_ALIGNED(uint8_t vin[256]);                                          \
+    SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n, c);                                 \
+    }                                                                        \
+    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);           \
+    ANY_SIMD(vin, vout, vout + 256, MASK + 1, c);                            \
+    memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r);                        \
+    memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r);                  \
   }
 
 #define ANY12MS(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                       \
-  void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u,       \
-               uint8_t* dst_v, int width, const struct ArgbConstants* c) {   \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u,        \
+               uint8_t* dst_v, int width, const struct ArgbConstants* c) {    \
     SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
     SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
-    memset(vin, 0, sizeof(vin));   /* for msan */                            \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
     memset(vout, 0, sizeof(vout)); /* for msan */                            \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -2277,35 +2269,12 @@ ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
 #ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
 ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
 #endif
-#ifdef HAS_RGBTOUVMATRIXROW_NEON
-ANY12MS(RGBToUVMatrixRow_Any_NEON, RGBToUVMatrixRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RGB565TOUVMATRIXROW_NEON
-ANY12MS(RGB565ToUVMatrixRow_Any_NEON, RGB565ToUVMatrixRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVMATRIXROW_NEON
-ANY12MS(ARGB1555ToUVMatrixRow_Any_NEON, ARGB1555ToUVMatrixRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVMATRIXROW_NEON
-ANY12MS(ARGB4444ToUVMatrixRow_Any_NEON, ARGB4444ToUVMatrixRow_NEON, 0, 2, 15)
-#endif
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX2
-ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 31)
-ANY12MS(RGBToUVMatrixRow_Any_AVX2, RGBToUVMatrixRow_AVX2, 0, 3, 31)
-ANY12MS(RGB565ToUVMatrixRow_Any_AVX2, RGB565ToUVMatrixRow_AVX2, 0, 2, 31)
-#ifdef HAS_ARGB1555TOARGBROW_AVX2
-ANY12MS(ARGB1555ToUVMatrixRow_Any_AVX2, ARGB1555ToUVMatrixRow_AVX2, 0, 2, 31)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_AVX2
-ANY12MS(ARGB4444ToUVMatrixRow_Any_AVX2, ARGB4444ToUVMatrixRow_AVX2, 0, 2, 31)
-#endif
+ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
 #endif
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW
 ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63)
 #endif
-#ifdef HAS_RGBTOUVMATRIXROW_AVX512BW
-ANY12MS(RGBToUVMatrixRow_Any_AVX512BW, RGBToUVMatrixRow_AVX512BW, 0, 3, 63)
-#endif
 #ifdef HAS_ARGBTOUVMATRIXROW_SSSE3
 ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7)
 #endif
@@ -2322,20 +2291,20 @@ ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15)
 ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
 #endif
 
-#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK)                       \
-  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width, \
-               const struct ArgbConstants* c) {                     \
-    SIMD_ALIGNED(uint8_t vin[256]);                                 \
-    SIMD_ALIGNED(uint8_t vout[256]);                                \
-    memset(vin, 0, sizeof(vin)); /* for msan */                     \
-    int r = width & MASK;                                           \
-    int n = width & ~MASK;                                          \
-    if (n > 0) {                                                    \
-      ANY_SIMD(src_ptr, dst_ptr, n, c);                             \
-    }                                                               \
-    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);  \
-    ANY_SIMD(vin, vout, MASK + 1, c);                               \
-    memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r);             \
+#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK)                                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width,          \
+               const struct ArgbConstants* c) {                              \
+    SIMD_ALIGNED(uint8_t vin[256]);                                          \
+    SIMD_ALIGNED(uint8_t vout[256]);                                         \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_ptr, n, c);                                      \
+    }                                                                        \
+    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);           \
+    ANY_SIMD(vin, vout, MASK + 1, c);                                        \
+    memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r);                      \
   }
 
 #ifdef HAS_ARGBTOYROW_SSSE3
@@ -2343,14 +2312,6 @@ ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15)
 #endif
 #ifdef HAS_ARGBTOYROW_AVX2
 ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31)
-ANY11MC(RGBToYMatrixRow_Any_AVX2, RGBToYMatrixRow_AVX2, 3, 31)
-ANY11MC(RGB565ToYMatrixRow_Any_AVX2, RGB565ToYMatrixRow_AVX2, 2, 31)
-#ifdef HAS_ARGB1555TOYMATRIXROW_AVX2
-ANY11MC(ARGB1555ToYMatrixRow_Any_AVX2, ARGB1555ToYMatrixRow_AVX2, 2, 31)
-#endif
-#ifdef HAS_ARGB4444TOYMATRIXROW_AVX2
-ANY11MC(ARGB4444ToYMatrixRow_Any_AVX2, ARGB4444ToYMatrixRow_AVX2, 2, 31)
-#endif
 #endif
 #ifdef HAS_ARGBTOYROW_AVX512BW
 ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63)
@@ -2361,18 +2322,6 @@ ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
 #ifdef HAS_ARGBTOYMATRIXROW_NEON_DOTPROD
 ANY11MC(ARGBToYMatrixRow_Any_NEON_DotProd, ARGBToYMatrixRow_NEON_DotProd, 4, 15)
 #endif
-#ifdef HAS_RGBTOYMATRIXROW_NEON
-ANY11MC(RGBToYMatrixRow_Any_NEON, RGBToYMatrixRow_NEON, 3, 15)
-#endif
-#ifdef HAS_RGB565TOYMATRIXROW_NEON
-ANY11MC(RGB565ToYMatrixRow_Any_NEON, RGB565ToYMatrixRow_NEON, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOYMATRIXROW_NEON
-ANY11MC(ARGB1555ToYMatrixRow_Any_NEON, ARGB1555ToYMatrixRow_NEON, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOYMATRIXROW_NEON
-ANY11MC(ARGB4444ToYMatrixRow_Any_NEON, ARGB4444ToYMatrixRow_NEON, 2, 15)
-#endif
 #ifdef HAS_ARGBTOYMATRIXROW_LSX
 ANY11MC(ARGBToYMatrixRow_Any_LSX, ARGBToYMatrixRow_LSX, 4, 15)
 #endif
diff --git a/source/row_common.cc b/source/row_common.cc
index 70ceaf5c8..b2a0ec12b 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -14,7 +14,7 @@
 #include <string.h>  // For memcpy and memset.
 
 #include "libyuv/basic_types.h"
-#include "libyuv/convert_argb.h"       // For kYuvI601Constants
+#include "libyuv/convert_argb.h"  // For kYuvI601Constants
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
 
 #ifdef __cplusplus
@@ -37,6 +37,10 @@ extern "C" {
 // LIBYUV_UNLIMITED_BT709
 // LIBYUV_UNLIMITED_BT2020
 
+#if defined(LIBYUV_BIT_EXACT)
+#define LIBYUV_UNATTENUATE_DUP 1
+#endif
+
 // llvm x86 is poor at ternary operator, so use branchless min/max.
 
 #define USE_BRANCHLESS 1
@@ -749,31 +753,28 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
 MAKEROWYJ(RGBA, 3, 2, 1, 4)
 #undef MAKEROWYJ
 
-static __inline uint8_t RGBToYMatrix(uint8_t b0,
-                                     uint8_t b1,
-                                     uint8_t b2,
-                                     uint8_t b3,
+static __inline uint8_t RGBToYMatrix(uint8_t r,
+                                     uint8_t g,
+                                     uint8_t b,
                                      const struct ArgbConstants* c) {
-  return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
-          c->kRGBToY[3] * b3 + c->kAddY[0]) >>
+  return (c->kRGBToY[2] * r + c->kRGBToY[1] * g + c->kRGBToY[0] * b +
+          c->kAddY[0]) >>
          8;
 }
-static __inline uint8_t RGBToUMatrix(uint8_t b0,
-                                     uint8_t b1,
-                                     uint8_t b2,
-                                     uint8_t b3,
+static __inline uint8_t RGBToUMatrix(uint8_t r,
+                                     uint8_t g,
+                                     uint8_t b,
                                      const struct ArgbConstants* c) {
-  return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
-                          c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
+  return (c->kAddUV[0] -
+          (c->kRGBToU[2] * r + c->kRGBToU[1] * g + c->kRGBToU[0] * b)) >>
          8;
 }
-static __inline uint8_t RGBToVMatrix(uint8_t b0,
-                                     uint8_t b1,
-                                     uint8_t b2,
-                                     uint8_t b3,
+static __inline uint8_t RGBToVMatrix(uint8_t r,
+                                     uint8_t g,
+                                     uint8_t b,
                                      const struct ArgbConstants* c) {
-  return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
-                          c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
+  return (c->kAddUV[0] -
+          (c->kRGBToV[2] * r + c->kRGBToV[1] * g + c->kRGBToV[0] * b)) >>
          8;
 }
 
@@ -783,8 +784,7 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
                         const struct ArgbConstants* c) {
   int x;
   for (x = 0; x < width; ++x) {
-    dst_y[0] =
-        RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
+    dst_y[0] = RGBToYMatrix(src_argb[2], src_argb[1], src_argb[0], c);
     src_argb += 4;
     dst_y += 1;
   }
@@ -799,28 +799,25 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
   const uint8_t* src_argb1 = src_argb + src_stride_argb;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8_t b0 =
+    uint8_t ab =
         (src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
-    uint8_t b1 =
+    uint8_t ag =
         (src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
-    uint8_t b2 =
+    uint8_t ar =
         (src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
-    uint8_t b3 =
-        (src_argb[3] + src_argb[7] + src_argb1[3] + src_argb1[7] + 2) >> 2;
-    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
-    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
+    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
     src_argb += 8;
     src_argb1 += 8;
     dst_u += 1;
     dst_v += 1;
   }
   if (width & 1) {
-    uint8_t b0 = (src_argb[0] + src_argb1[0] + 1) >> 1;
-    uint8_t b1 = (src_argb[1] + src_argb1[1] + 1) >> 1;
-    uint8_t b2 = (src_argb[2] + src_argb1[2] + 1) >> 1;
-    uint8_t b3 = (src_argb[3] + src_argb1[3] + 1) >> 1;
-    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
-    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
+    uint8_t ab = (src_argb[0] + src_argb1[0] + 1) >> 1;
+    uint8_t ag = (src_argb[1] + src_argb1[1] + 1) >> 1;
+    uint8_t ar = (src_argb[2] + src_argb1[2] + 1) >> 1;
+    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
   }
 }
 
@@ -831,10 +828,11 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
                             const struct ArgbConstants* c) {
   int x;
   for (x = 0; x < width; ++x) {
-    dst_u[0] =
-        RGBToUMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
-    dst_v[0] =
-        RGBToVMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
+    uint8_t ab = src_argb[0];
+    uint8_t ag = src_argb[1];
+    uint8_t ar = src_argb[2];
+    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
     src_argb += 4;
     dst_u += 1;
     dst_v += 1;
@@ -1514,18 +1512,18 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
       YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
 
-#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \
-  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =   \
-      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \
-                        -(RV), 0, AY, AUV);                                  \
-  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =   \
-      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \
-                        -(BV), 0, AY, AUV);                                  \
-  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =   \
-      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),     \
-                        -(GV), -(RV), AY, AUV);                              \
-  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =   \
-      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),     \
+#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV)   \
+  const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =            \
+      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV),   \
+                        -(RV), 0, AY, AUV);                                    \
+  const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =            \
+      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV),   \
+                        -(BV), 0, AY, AUV);                                    \
+  const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =            \
+      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),       \
+                        -(GV), -(RV), AY, AUV);                                \
+  const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =            \
+      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),       \
                         -(GV), -(BV), AY, AUV);
 
 // BT.601 limited range RGB to YUV coefficients
@@ -3468,7 +3466,7 @@ void ARGBBlendRow_C(const uint8_t* src_argb,
 }
 #undef BLEND
 
-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
 void BlendPlaneRow_C(const uint8_t* src0,
                      const uint8_t* src1,
                      const uint8_t* alpha,
@@ -3575,8 +3573,12 @@ const uint32_t fixed_invtbl8[256] = {
     T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};
 #undef T
 
+#if defined(LIBYUV_UNATTENUATE_DUP)
 // This code mimics the Intel SIMD version for better testability.
 #define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
+#else
+#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
+#endif
 
 // mimics the Intel SIMD code for exactness.
 void ARGBUnattenuateRow_C(const uint8_t* src_argb,
@@ -3664,8 +3666,7 @@ void ARGBAffineRow_C(const uint8_t* src_argb,
     int x = (int)(uv[0]);
     int y = (int)(uv[1]);
     *(uint32_t*)(dst_argb) =
-        *(const uint32_t*)(src_argb + (ptrdiff_t)y * src_argb_stride +
-                           (ptrdiff_t)x * 4);
+        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
     dst_argb += 4;
     uv[0] += uv_dudv[2];
     uv[1] += uv_dudv[3];
@@ -4171,7 +4172,7 @@ void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_NV12TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_NV12TORGB24ROW_AVX2)
 void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb24,
@@ -4182,7 +4183,11 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
     ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
     src_y += twidth;
     src_uv += twidth;
     dst_rgb24 += twidth * 3;
@@ -4191,7 +4196,7 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_NV21TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_NV21TORGB24ROW_AVX2)
 void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_rgb24,
@@ -4202,7 +4207,11 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
     ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
     src_y += twidth;
     src_vu += twidth;
     dst_rgb24 += twidth * 3;
@@ -4211,7 +4220,7 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB565ROW_AVX2)
+#if defined(HAS_I422TORGB565ROW_AVX2)
 void I422ToRGB565Row_AVX2(const uint8_t* src_y,
                           const uint8_t* src_u,
                           const uint8_t* src_v,
@@ -4222,7 +4231,11 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
     ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -4232,7 +4245,7 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTOARGB1555ROW_AVX2)
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
 void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -4244,7 +4257,11 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
     ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+#else
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -4254,7 +4271,7 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTOARGB4444ROW_AVX2)
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
 void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@@ -4266,7 +4283,11 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
     ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+#else
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -4276,7 +4297,7 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_I422TORGB24ROW_AVX2)
 void I422ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
@@ -4288,7 +4309,11 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
     ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -4298,51 +4323,7 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
-void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y,
-                               const uint8_t* src_u,
-                               const uint8_t* src_v,
-                               uint8_t* dst_rgb24,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth);
-    ARGBToRGB24Row_AVX512VBMI(row, dst_rgb24, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_rgb24 += twidth * 3;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX2)
-void I422ToRGB24Row_AVX512BW(const uint8_t* src_y,
-                             const uint8_t* src_u,
-                             const uint8_t* src_v,
-                             uint8_t* dst_rgb24,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth);
-    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_rgb24 += twidth * 3;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I444TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_I444TORGB24ROW_AVX2)
 void I444ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
@@ -4354,7 +4335,11 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
     ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
     src_y += twidth;
     src_u += twidth;
     src_v += twidth;
@@ -4364,7 +4349,7 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
-#if defined(HAS_NV12TOARGBROW_AVX2) && defined(HAS_ARGBTORGB565ROW_AVX2)
+#if defined(HAS_NV12TORGB565ROW_AVX2)
 void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
                           const uint8_t* src_uv,
                           uint8_t* dst_rgb565,
@@ -4375,7 +4360,11 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
     ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
     src_y += twidth;
     src_uv += twidth;
     dst_rgb565 += twidth * 2;
@@ -4384,6 +4373,26 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
+}
+#endif  // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 32 RAW pixels (128 bytes) to 32 YJ values.
+}
+#endif  // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
+}
+#endif  // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
+}
+#endif  // HAS_RAWTOYJROW_SSSE3
+
 #ifdef HAS_INTERPOLATEROW_16TO8_AVX2
 void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
                                const uint16_t* src_ptr,
@@ -4395,7 +4404,7 @@ void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
   SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    InterpolateRow_16_AVX2(row, src_ptr, src_stride, twidth, source_y_fraction);
+    InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
     Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
     src_ptr += twidth;
     dst_ptr += twidth;
@@ -4601,465 +4610,6 @@ void HalfMergeUVRow_C(const uint8_t* src_u,
 
 #undef STATIC_CAST
 
-void RGBToYMatrixRow_C(const uint8_t* src_rgb,
-                       uint8_t* dst_y,
-                       int width,
-                       const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_C(src_rgb, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_rgb += twidth * 3;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void RGBToUVMatrixRow_C(const uint8_t* src_rgb,
-                        int src_stride_rgb,
-                        uint8_t* dst_u,
-                        uint8_t* dst_v,
-                        int width,
-                        const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_C(src_rgb, row, twidth);
-    RGB24ToARGBRow_C(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2)
-void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_AVX2(src_rgb, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_rgb += twidth * 3;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2)
-void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
-                           int src_stride_rgb,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_AVX2(src_rgb, row, twidth);
-    RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) && \
-    defined(HAS_RGB24TOARGBROW_AVX512BW)
-void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb,
-                               int src_stride_rgb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_AVX512BW(src_rgb, row, twidth);
-    RGB24ToARGBRow_AVX512BW(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4,
-                            twidth);
-    ARGBToUVMatrixRow_AVX512BW(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_RGB24TOARGBROW_NEON)
-void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb,
-                           int src_stride_rgb,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_NEON(src_rgb, row, twidth);
-    RGB24ToARGBRow_NEON(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_C(src_rgb565, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
-                           int src_stride_rgb565,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_C(src_rgb565, row, twidth);
-    RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
-                      twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2)
-void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565,
-                             uint8_t* dst_y,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_AVX2(src_rgb565, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2)
-void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565,
-                              int src_stride_rgb565,
-                              uint8_t* dst_u,
-                              uint8_t* dst_v,
-                              int width,
-                              const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_AVX2(src_rgb565, row, twidth);
-    RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
-                         twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOYMATRIXROW_NEON)
-void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565,
-                             uint8_t* dst_y,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_NEON(src_rgb565, row, twidth);
-    ARGBToYMatrixRow_NEON(row, dst_y, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOUVMATRIXROW_NEON)
-void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565,
-                              int src_stride_rgb565,
-                              uint8_t* dst_u,
-                              uint8_t* dst_v,
-                              int width,
-                              const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_NEON(src_rgb565, row, twidth);
-    RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
-                         twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_C(src_argb1555, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
-                             int src_stride_argb1555,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_C(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4,
-                        twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_C(src_argb4444, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
-                             int src_stride_argb4444,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_C(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4,
-                        twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_AVX2(src_argb1555, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_AVX2(src_argb4444, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_AVX2(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_AVX2(src_argb1555 + src_stride_argb1555,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_AVX2(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_AVX2(src_argb4444 + src_stride_argb4444,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-#endif
-
-#if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON)
-void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth);
-    ARGBToYMatrixRow_NEON(row, dst_y, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON)
-void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth);
-    ARGBToYMatrixRow_NEON(row, dst_y, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON)
-void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON)
-void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 10ecf5910..767dc8605 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -21,10 +21,6 @@ extern "C" {
     (defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_ENABLE_ROWWIN)
 
-// Note: for avx and avx512 declare clobber as xmm registers due to
-// clang for windows needing to preserve xmm registers but not saving
-// them if declared as ymm or zmm.
-
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
@@ -33,6 +29,7 @@ extern "C" {
 static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
 
+
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@@ -51,10 +48,8 @@ static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
 #ifdef HAS_RGB24TOARGBROW_SSSE3
 
 // Shuffle table for converting RGB24 to ARGB.
-static const uvec8 kShuffleMaskRGB24ToARGB[2] = {
-    {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u},
-    {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u,
-     128u}};
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
 static const uvec8 kShuffleMaskRAWToARGB = {
@@ -118,76 +113,34 @@ static const lvec8 kShuffleNV21 = {
 };
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
-#if defined(HAS_J400TOARGBROW_AVX2) || defined(HAS_J400TOARGBROW_AVX512BW)
-alignas(64) static const uint8_t kShuffleMaskJ400ToARGB[64] = {
-    0u,  0u,   0u,  128u, 1u,  1u,   1u,  128u, 2u,  2u,   2u,  128u, 3u,  3u,
-    3u,  128u, 4u,  4u,   4u,  128u, 5u,  5u,   5u,  128u, 6u,  6u,   6u,  128u,
-    7u,  7u,   7u,  128u, 8u,  8u,   8u,  128u, 9u,  9u,   9u,  128u, 10u, 10u,
-    10u, 128u, 11u, 11u,  11u, 128u, 12u, 12u,  12u, 128u, 13u, 13u,  13u, 128u,
-    14u, 14u,  14u, 128u, 15u, 15u,  15u, 128u};
-#endif
-
-#ifdef HAS_J400TOARGBROW_AVX2
-void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   asm volatile(
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"
-      "vpslld      $0x18,%%ymm7,%%ymm7           \n"
-      "vmovdqa     (%3),%%ymm5                     \n"
-      "vmovdqa     0x20(%3),%%ymm6                  \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0x18,%%xmm5                  \n"
 
       LABELALIGN
       "1:          \n"
-      "vbroadcasti128 (%0),%%ymm0                \n"
-      "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"
-      "vpshufb     %%ymm6,%%ymm0,%%ymm2          \n"
-      "vpor        %%ymm7,%%ymm1,%%ymm1          \n"
-      "vpor        %%ymm7,%%ymm2,%%ymm2          \n"
-      "vmovdqu     %%ymm1,(%1)                   \n"
-      "vmovdqu     %%ymm2,0x20(%1)               \n"
-      "lea         0x10(%0),%0                   \n"
-      "lea         0x40(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
+      "movq        (%0),%%xmm0                   \n"
+      "lea         0x8(%0),%0                    \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklwd   %%xmm0,%%xmm0                 \n"
+      "punpckhwd   %%xmm1,%%xmm1                 \n"
+      "por         %%xmm5,%%xmm0                 \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
       "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src_y),                 // %0
-        "+r"(dst_argb),              // %1
-        "+r"(width)                  // %2
-      : "r"(kShuffleMaskJ400ToARGB)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
-#endif  // HAS_J400TOARGBROW_AVX2
-
-#ifdef HAS_J400TOARGBROW_AVX512BW
-void J400ToARGBRow_AVX512BW(const uint8_t* src_y,
-                            uint8_t* dst_argb,
-                            int width) {
-  asm volatile(
-      "vpternlogd  $0xff,%%zmm7,%%zmm7,%%zmm7    \n"  // 0xffffffff
-      "vpslld      $0x18,%%zmm7,%%zmm7           \n"  // 0xff000000
-      "vmovdqa64   %3,%%zmm5                     \n"
-
-      LABELALIGN
-      "1:          \n"
-      "vbroadcasti32x4 (%0),%%zmm0               \n"
-      "vbroadcasti32x4 0x10(%0),%%zmm1          \n"
-      "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
-      "vpshufb     %%zmm5,%%zmm1,%%zmm1          \n"
-      "vpord       %%zmm7,%%zmm0,%%zmm0          \n"
-      "vpord       %%zmm7,%%zmm1,%%zmm1          \n"
-      "vmovdqu64   %%zmm0,(%1)                   \n"
-      "vmovdqu64   %%zmm1,0x40(%1)               \n"
-      "lea         0x20(%0),%0                   \n"
-      "lea         0x80(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src_y),                 // %0
-        "+r"(dst_argb),              // %1
-        "+r"(width)                  // %2
-      : "m"(kShuffleMaskJ400ToARGB)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm5", "xmm7");
-}
-#endif  // HAS_J400TOARGBROW_AVX512BW
+#endif  // HAS_J400TOARGBROW_SSE2
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
 void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
@@ -223,62 +176,13 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
       "lea         0x40(%1),%1                   \n"
       "sub         $0x10,%2                      \n"
       "jg          1b                            \n"
-      : "+r"(src_rgb24),                 // %0
-        "+r"(dst_argb),                  // %1
-        "+r"(width)                      // %2
-      : "m"(kShuffleMaskRGB24ToARGB[0])  // %3
+      : "+r"(src_rgb24),              // %0
+        "+r"(dst_argb),               // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskRGB24ToARGB)  // %3
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-#ifdef HAS_RGB24TOARGBROW_AVX2
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_argb,
-                         int width) {
-  // Reference to prevent discarding of kShuffleMaskRGB24ToARGB[1] which is
-  // accessed via offset in assembly.
-  const uvec8* dummy = &kShuffleMaskRGB24ToARGB[1];
-  (void)dummy;
-  asm volatile(
-      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 0xff000000
-      "vpslld      $0x18,%%ymm6,%%ymm6           \n"
-      "vbroadcasti128 %3,%%ymm4                  \n"
-      "vbroadcasti128 16+%3,%%ymm5               \n"
-
-      LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%xmm0                   \n"  // first 12
-      "vinserti128 $1,12(%0),%%ymm0,%%ymm0       \n"  // second 12
-      "vmovdqu     24(%0),%%xmm1                 \n"  // third 12
-      "vinserti128 $1,36(%0),%%ymm1,%%ymm1       \n"  // forth 12
-      "vmovdqu     48(%0),%%xmm2                 \n"  // fifth 12
-      "vinserti128 $1,60(%0),%%ymm2,%%ymm2       \n"  // sixth 12
-      "vmovdqu     68(%0),%%xmm3                 \n"  // seventh 12
-      "vinserti128 $1,80(%0),%%ymm3,%%ymm3       \n"  // eighth 12
-      "lea         96(%0),%0                     \n"
-      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
-      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
-      "vpshufb     %%ymm4,%%ymm2,%%ymm2          \n"
-      "vpshufb     %%ymm5,%%ymm3,%%ymm3          \n"
-      "vpor        %%ymm6,%%ymm0,%%ymm0          \n"
-      "vpor        %%ymm6,%%ymm1,%%ymm1          \n"
-      "vpor        %%ymm6,%%ymm2,%%ymm2          \n"
-      "vpor        %%ymm6,%%ymm3,%%ymm3          \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "vmovdqu     %%ymm1,0x20(%1)               \n"
-      "vmovdqu     %%ymm2,0x40(%1)               \n"
-      "vmovdqu     %%ymm3,0x60(%1)               \n"
-      "lea         0x80(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src_rgb24),                 // %0
-        "+r"(dst_argb),                  // %1
-        "+r"(width)                      // %2
-      : "m"(kShuffleMaskRGB24ToARGB[0])  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
-}
-#endif  // HAS_RGB24TOARGBROW_AVX2
-
 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   asm volatile(
       "pcmpeqb     %%xmm6,%%xmm6                 \n"  // 0xff000000
@@ -362,10 +266,7 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 static const uint32_t kPermdRAWToARGB_AVX512BW[16] = {
     0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
 
-void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
-                           uint8_t* dst_argb,
-                           const uint32_t* shuffler,
-                           int width) {
+void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) {
   asm volatile(
       "vpternlogd  $0xff,%%zmm6,%%zmm6,%%zmm6    \n"  // 0xffffffff
       "vpslld      $0x18,%%zmm6,%%zmm6           \n"  // 0xff000000
@@ -406,25 +307,19 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
         "+r"(width)                     // %2
       : "m"(kPermdRAWToARGB_AVX512BW),  // %3
         "m"(*shuffler)                  // %4
-      : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5", "xmm6");
+      : "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6");
 }
 
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw,
-                           uint8_t* dst_argb,
-                           int width) {
-  RGBToARGBRow_AVX512BW(src_raw, dst_argb,
-                        (const uint32_t*)&kShuffleMaskRAWToARGB, width);
+void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width);
 }
 
-void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24,
-                             uint8_t* dst_argb,
-                             int width) {
-  RGBToARGBRow_AVX512BW(src_rgb24, dst_argb,
-                        (const uint32_t*)&kShuffleMaskRGB24ToARGB[0], width);
+void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+  RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width);
 }
 #endif
 
+
 // Same code as RAWToARGB with different shuffler and A in low bits
 void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
   asm volatile(
@@ -496,47 +391,46 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-#ifdef HAS_RGB565TOARGBROW_AVX2
-void RGB565ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
       "mov         $0x1080108,%%eax              \n"
-      "vmovd       %%eax,%%xmm5                  \n"
-      "vpbroadcastd %%xmm5,%%ymm5                \n"
+      "movd        %%eax,%%xmm5                  \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
       "mov         $0x20802080,%%eax             \n"
-      "vmovd       %%eax,%%xmm6                  \n"
-      "vpbroadcastd %%xmm6,%%ymm6                \n"
-      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
-      "vpsllw      $0xb,%%ymm3,%%ymm3            \n"
-      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
-      "vpsllw      $10,%%ymm4,%%ymm4             \n"
-      "vpsrlw      $5,%%ymm4,%%ymm4              \n"
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"
-      "vpsllw      $0x8,%%ymm7,%%ymm7            \n"
+      "movd        %%eax,%%xmm6                  \n"
+      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psllw       $0xb,%%xmm3                   \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psllw       $10,%%xmm4                    \n"
+      "psrlw       $5,%%xmm4                     \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "psllw       $0x8,%%xmm7                   \n"
       "sub         %0,%1                         \n"
       "sub         %0,%1                         \n"
 
       LABELALIGN
       "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "vpand       %%ymm3,%%ymm0,%%ymm1          \n"
-      "vpsllw      $0xb,%%ymm0,%%ymm2            \n"
-      "vpmulhuw    %%ymm5,%%ymm1,%%ymm1          \n"
-      "vpmulhuw    %%ymm5,%%ymm2,%%ymm2          \n"
-      "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
-      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
-      "vpand       %%ymm4,%%ymm0,%%ymm0          \n"
-      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"
-      "vpor        %%ymm7,%%ymm0,%%ymm0          \n"
-      "vpunpcklbw  %%ymm0,%%ymm1,%%ymm2          \n"
-      "vpunpckhbw  %%ymm0,%%ymm1,%%ymm1          \n"
-      "vperm2i128  $0x20,%%ymm1,%%ymm2,%%ymm0    \n"
-      "vperm2i128  $0x31,%%ymm1,%%ymm2,%%ymm1    \n"
-      "vmovdqu     %%ymm0,(%1,%0,2)              \n"
-      "vmovdqu     %%ymm1,0x20(%1,%0,2)          \n"
-      "lea         0x20(%0),%0                   \n"
-      "sub         $0x10,%2                      \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "psllw       $0xb,%%xmm2                   \n"
+      "pmulhuw     %%xmm5,%%xmm1                 \n"
+      "pmulhuw     %%xmm5,%%xmm2                 \n"
+      "psllw       $0x8,%%xmm1                   \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "pand        %%xmm4,%%xmm0                 \n"
+      "pmulhuw     %%xmm6,%%xmm0                 \n"
+      "por         %%xmm7,%%xmm0                 \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "punpcklbw   %%xmm0,%%xmm1                 \n"
+      "punpckhbw   %%xmm0,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
+      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
+      "lea         0x10(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
       "jg          1b                            \n"
-      "vzeroupper  \n"
       : "+r"(src),   // %0
         "+r"(dst),   // %1
         "+r"(width)  // %2
@@ -544,50 +438,50 @@ void RGB565ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
         "xmm6", "xmm7");
 }
-#endif
 
-#ifdef HAS_ARGB1555TOARGBROW_AVX2
-void ARGB1555ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
       "mov         $0x1080108,%%eax              \n"
-      "vmovd       %%eax,%%xmm5                  \n"
-      "vpbroadcastd %%xmm5,%%ymm5                \n"
+      "movd        %%eax,%%xmm5                  \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
       "mov         $0x42004200,%%eax             \n"
-      "vmovd       %%eax,%%xmm6                  \n"
-      "vpbroadcastd %%xmm6,%%ymm6                \n"
-      "vpcmpeqb    %%ymm3,%%ymm3,%%ymm3          \n"
-      "vpsllw      $0xb,%%ymm3,%%ymm3            \n"
-      "vpsrlw      $0x6,%%ymm3,%%ymm4            \n"
-      "vpcmpeqb    %%ymm7,%%ymm7,%%ymm7          \n"
-      "vpsllw      $0x8,%%ymm7,%%ymm7            \n"
+      "movd        %%eax,%%xmm6                  \n"
+      "pshufd      $0x0,%%xmm6,%%xmm6            \n"
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psllw       $0xb,%%xmm3                   \n"
+      "movdqa      %%xmm3,%%xmm4                 \n"
+      "psrlw       $0x6,%%xmm4                   \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "psllw       $0x8,%%xmm7                   \n"
       "sub         %0,%1                         \n"
       "sub         %0,%1                         \n"
 
       LABELALIGN
       "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "vpsllw      $0x1,%%ymm0,%%ymm1            \n"
-      "vpsllw      $0xb,%%ymm0,%%ymm2            \n"
-      "vpand       %%ymm3,%%ymm1,%%ymm1          \n"
-      "vpmulhuw    %%ymm5,%%ymm2,%%ymm2          \n"
-      "vpmulhuw    %%ymm5,%%ymm1,%%ymm1          \n"
-      "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
-      "vpor        %%ymm2,%%ymm1,%%ymm1          \n"
-      "vpsraw      $0x8,%%ymm0,%%ymm2            \n"
-      "vpand       %%ymm4,%%ymm0,%%ymm0          \n"
-      "vpmulhuw    %%ymm6,%%ymm0,%%ymm0          \n"
-      "vpand       %%ymm7,%%ymm2,%%ymm2          \n"
-      "vpor        %%ymm2,%%ymm0,%%ymm0          \n"
-      "vpunpcklbw  %%ymm0,%%ymm1,%%ymm2          \n"
-      "vpunpckhbw  %%ymm0,%%ymm1,%%ymm1          \n"
-      "vperm2i128  $0x20,%%ymm1,%%ymm2,%%ymm0    \n"
-      "vperm2i128  $0x31,%%ymm1,%%ymm2,%%ymm1    \n"
-      "vmovdqu     %%ymm0,(%1,%0,2)              \n"
-      "vmovdqu     %%ymm1,0x20(%1,%0,2)          \n"
-      "lea         0x20(%0),%0                   \n"
-      "sub         $0x10,%2                      \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "psllw       $0x1,%%xmm1                   \n"
+      "psllw       $0xb,%%xmm2                   \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "pmulhuw     %%xmm5,%%xmm2                 \n"
+      "pmulhuw     %%xmm5,%%xmm1                 \n"
+      "psllw       $0x8,%%xmm1                   \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm4,%%xmm0                 \n"
+      "psraw       $0x8,%%xmm2                   \n"
+      "pmulhuw     %%xmm6,%%xmm0                 \n"
+      "pand        %%xmm7,%%xmm2                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "punpcklbw   %%xmm0,%%xmm1                 \n"
+      "punpckhbw   %%xmm0,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x00(%1,%0,2)          \n"
+      "movdqu      %%xmm2,0x10(%1,%0,2)          \n"
+      "lea         0x10(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
       "jg          1b                            \n"
-      "vzeroupper  \n"
       : "+r"(src),   // %0
         "+r"(dst),   // %1
         "+r"(width)  // %2
@@ -595,75 +489,74 @@ void ARGB1555ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
         "xmm6", "xmm7");
 }
-#endif
 
-#ifdef HAS_ARGB4444TOARGBROW_AVX2
-void ARGB4444ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
-      "mov         $0x0f0f0f0f,%%eax             \n"
-      "vmovd       %%eax,%%xmm4                  \n"
-      "vpbroadcastd %%xmm4,%%ymm4                \n"
-      "vpslld      $0x4,%%ymm4,%%ymm5            \n"
+      "mov         $0xf0f0f0f,%%eax              \n"
+      "movd        %%eax,%%xmm4                  \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "pslld       $0x4,%%xmm5                   \n"
       "sub         %0,%1                         \n"
       "sub         %0,%1                         \n"
 
       LABELALIGN
       "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "vpand       %%ymm5,%%ymm0,%%ymm2          \n"
-      "vpand       %%ymm4,%%ymm0,%%ymm0          \n"
-      "vpsllw      $0x4,%%ymm0,%%ymm1            \n"
-      "vpsrlw      $0x4,%%ymm2,%%ymm3            \n"
-      "vpor        %%ymm1,%%ymm0,%%ymm0          \n"
-      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
-      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
-      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
-      "vperm2i128  $0x20,%%ymm1,%%ymm0,%%ymm2    \n"
-      "vperm2i128  $0x31,%%ymm1,%%ymm0,%%ymm1    \n"
-      "vmovdqu     %%ymm2,(%1,%0,2)              \n"
-      "vmovdqu     %%ymm1,0x20(%1,%0,2)          \n"
-      "lea         0x20(%0),%0                   \n"
-      "sub         $0x10,%2                      \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pand        %%xmm4,%%xmm0                 \n"
+      "pand        %%xmm5,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "psllw       $0x4,%%xmm1                   \n"
+      "psrlw       $0x4,%%xmm3                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm2,%%xmm0                 \n"
+      "punpckhbw   %%xmm2,%%xmm1                 \n"
+      "movdqu      %%xmm0,0x00(%1,%0,2)          \n"
+      "movdqu      %%xmm1,0x10(%1,%0,2)          \n"
+      "lea         0x10(%0),%0                   \n"
+      "sub         $0x8,%2                       \n"
       "jg          1b                            \n"
-      "vzeroupper  \n"
       : "+r"(src),   // %0
         "+r"(dst),   // %1
         "+r"(width)  // %2
       :
       : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
-#endif
 
 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile("movdqa      %3,%%xmm6                     \n"
+      asm volatile("movdqa      %3,%%xmm6                     \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "movdqu      0x10(%0),%%xmm1               \n"
-               "movdqu      0x20(%0),%%xmm2               \n"
-               "movdqu      0x30(%0),%%xmm3               \n"
-               "lea         0x40(%0),%0                   \n"
-               "pshufb      %%xmm6,%%xmm0                 \n"
-               "pshufb      %%xmm6,%%xmm1                 \n"
-               "pshufb      %%xmm6,%%xmm2                 \n"
-               "pshufb      %%xmm6,%%xmm3                 \n"
-               "movdqa      %%xmm1,%%xmm4                 \n"
-               "psrldq      $0x4,%%xmm1                   \n"
-               "pslldq      $0xc,%%xmm4                   \n"
-               "movdqa      %%xmm2,%%xmm5                 \n"
-               "por         %%xmm4,%%xmm0                 \n"
-               "pslldq      $0x8,%%xmm5                   \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "por         %%xmm5,%%xmm1                 \n"
-               "psrldq      $0x8,%%xmm2                   \n"
-               "pslldq      $0x4,%%xmm3                   \n"
-               "por         %%xmm3,%%xmm2                 \n"
-               "movdqu      %%xmm1,0x10(%1)               \n"
-               "movdqu      %%xmm2,0x20(%1)               \n"
-               "lea         0x30(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm3               \n"
+      "lea         0x40(%0),%0                   \n"
+      "pshufb      %%xmm6,%%xmm0                 \n"
+      "pshufb      %%xmm6,%%xmm1                 \n"
+      "pshufb      %%xmm6,%%xmm2                 \n"
+      "pshufb      %%xmm6,%%xmm3                 \n"
+      "movdqa      %%xmm1,%%xmm4                 \n"
+      "psrldq      $0x4,%%xmm1                   \n"
+      "pslldq      $0xc,%%xmm4                   \n"
+      "movdqa      %%xmm2,%%xmm5                 \n"
+      "por         %%xmm4,%%xmm0                 \n"
+      "pslldq      $0x8,%%xmm5                   \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "psrldq      $0x8,%%xmm2                   \n"
+      "pslldq      $0x4,%%xmm3                   \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "movdqu      %%xmm2,0x20(%1)               \n"
+      "lea         0x30(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
                : "+r"(src),                    // %0
                  "+r"(dst),                    // %1
                  "+r"(width)                   // %2
@@ -673,35 +566,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile("movdqa      %3,%%xmm6                     \n"
+      asm volatile("movdqa      %3,%%xmm6                     \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "movdqu      0x10(%0),%%xmm1               \n"
-               "movdqu      0x20(%0),%%xmm2               \n"
-               "movdqu      0x30(%0),%%xmm3               \n"
-               "lea         0x40(%0),%0                   \n"
-               "pshufb      %%xmm6,%%xmm0                 \n"
-               "pshufb      %%xmm6,%%xmm1                 \n"
-               "pshufb      %%xmm6,%%xmm2                 \n"
-               "pshufb      %%xmm6,%%xmm3                 \n"
-               "movdqa      %%xmm1,%%xmm4                 \n"
-               "psrldq      $0x4,%%xmm1                   \n"
-               "pslldq      $0xc,%%xmm4                   \n"
-               "movdqa      %%xmm2,%%xmm5                 \n"
-               "por         %%xmm4,%%xmm0                 \n"
-               "pslldq      $0x8,%%xmm5                   \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "por         %%xmm5,%%xmm1                 \n"
-               "psrldq      $0x8,%%xmm2                   \n"
-               "pslldq      $0x4,%%xmm3                   \n"
-               "por         %%xmm3,%%xmm2                 \n"
-               "movdqu      %%xmm1,0x10(%1)               \n"
-               "movdqu      %%xmm2,0x20(%1)               \n"
-               "lea         0x30(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "movdqu      0x20(%0),%%xmm2               \n"
+      "movdqu      0x30(%0),%%xmm3               \n"
+      "lea         0x40(%0),%0                   \n"
+      "pshufb      %%xmm6,%%xmm0                 \n"
+      "pshufb      %%xmm6,%%xmm1                 \n"
+      "pshufb      %%xmm6,%%xmm2                 \n"
+      "pshufb      %%xmm6,%%xmm3                 \n"
+      "movdqa      %%xmm1,%%xmm4                 \n"
+      "psrldq      $0x4,%%xmm1                   \n"
+      "pslldq      $0xc,%%xmm4                   \n"
+      "movdqa      %%xmm2,%%xmm5                 \n"
+      "por         %%xmm4,%%xmm0                 \n"
+      "pslldq      $0x8,%%xmm5                   \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "por         %%xmm5,%%xmm1                 \n"
+      "psrldq      $0x8,%%xmm2                   \n"
+      "pslldq      $0x4,%%xmm3                   \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "movdqu      %%xmm2,0x20(%1)               \n"
+      "lea         0x30(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
                : "+r"(src),                  // %0
                  "+r"(dst),                  // %1
                  "+r"(width)                 // %2
@@ -853,6 +746,90 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif
 
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psrld       $0x1b,%%xmm3                  \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrld       $0x1a,%%xmm4                  \n"
+      "pslld       $0x5,%%xmm4                   \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0xb,%%xmm5                   \n"
+
+      LABELALIGN
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pslld       $0x8,%%xmm0                   \n"
+      "psrld       $0x3,%%xmm1                   \n"
+      "psrld       $0x5,%%xmm2                   \n"
+      "psrad       $0x10,%%xmm0                  \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "pand        %%xmm4,%%xmm2                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "movd        %3,%%xmm6                     \n"
+      "punpcklbw   %%xmm6,%%xmm6                 \n"
+      "movdqa      %%xmm6,%%xmm7                 \n"
+      "punpcklwd   %%xmm6,%%xmm6                 \n"
+      "punpckhwd   %%xmm7,%%xmm7                 \n"
+      "pcmpeqb     %%xmm3,%%xmm3                 \n"
+      "psrld       $0x1b,%%xmm3                  \n"
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrld       $0x1a,%%xmm4                  \n"
+      "pslld       $0x5,%%xmm4                   \n"
+      "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      "pslld       $0xb,%%xmm5                   \n"
+
+      LABELALIGN
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "paddusb     %%xmm6,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "pslld       $0x8,%%xmm0                   \n"
+      "psrld       $0x3,%%xmm1                   \n"
+      "psrld       $0x5,%%xmm2                   \n"
+      "psrad       $0x10,%%xmm0                  \n"
+      "pand        %%xmm3,%%xmm1                 \n"
+      "pand        %%xmm4,%%xmm2                 \n"
+      "pand        %%xmm5,%%xmm0                 \n"
+      "por         %%xmm2,%%xmm1                 \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
                                 uint8_t* dst,
@@ -899,6 +876,75 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
 }
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psrld       $0x1b,%%xmm4                  \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "pslld       $0x5,%%xmm5                   \n"
+      "movdqa      %%xmm4,%%xmm6                 \n"
+      "pslld       $0xa,%%xmm6                   \n"
+      "pcmpeqb     %%xmm7,%%xmm7                 \n"
+      "pslld       $0xf,%%xmm7                   \n"
+
+      LABELALIGN
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "movdqa      %%xmm0,%%xmm3                 \n"
+      "psrad       $0x10,%%xmm0                  \n"
+      "psrld       $0x3,%%xmm1                   \n"
+      "psrld       $0x6,%%xmm2                   \n"
+      "psrld       $0x9,%%xmm3                   \n"
+      "pand        %%xmm7,%%xmm0                 \n"
+      "pand        %%xmm4,%%xmm1                 \n"
+      "pand        %%xmm5,%%xmm2                 \n"
+      "pand        %%xmm6,%%xmm3                 \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "por         %%xmm3,%%xmm2                 \n"
+      "por         %%xmm2,%%xmm0                 \n"
+      "packssdw    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"
+      "psllw       $0xc,%%xmm4                   \n"
+      "movdqa      %%xmm4,%%xmm3                 \n"
+      "psrlw       $0x8,%%xmm3                   \n"
+
+      LABELALIGN
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "pand        %%xmm3,%%xmm0                 \n"
+      "pand        %%xmm4,%%xmm1                 \n"
+      "psrlq       $0x4,%%xmm0                   \n"
+      "psrlq       $0x8,%%xmm1                   \n"
+      "por         %%xmm1,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "lea         0x10(%0),%0                   \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
 /*
@@ -1166,21 +1212,21 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
 void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile("movdqa      %3,%%xmm2                     \n"
+      asm volatile("movdqa      %3,%%xmm2                     \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "movdqu      0x10(%0),%%xmm1               \n"
-               "psrlw       $8,%%xmm0                     \n"
-               "psrlw       $8,%%xmm1                     \n"
-               "packuswb    %%xmm1,%%xmm0                 \n"
-               "pshufb      %%xmm2,%%xmm0                 \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "lea         0x20(%0),%0                   \n"
-               "lea         0x10(%1),%1                   \n"
-               "sub         $0x4,%2                       \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "psrlw       $8,%%xmm0                     \n"
+      "psrlw       $8,%%xmm1                     \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "pshufb      %%xmm2,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x20(%0),%0                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
                : "+r"(src_ab64),          // %0
                  "+r"(dst_argb),          // %1
                  "+r"(width)              // %2
@@ -1271,21 +1317,21 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
 void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
-  asm volatile("vbroadcasti128 %3,%%ymm2                  \n" LABELALIGN
-               "1:          \n"
-               "vmovdqu     (%0),%%ymm0                   \n"
-               "vmovdqu     0x20(%0),%%ymm1               \n"
-               "vpsrlw      $8,%%ymm0,%%ymm0              \n"
-               "vpsrlw      $8,%%ymm1,%%ymm1              \n"
-               "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
-               "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-               "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "lea         0x40(%0),%0                   \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x8,%2                       \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      asm volatile("vbroadcasti128 %3,%%ymm2                  \n" LABELALIGN
+      "1:          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "vpsrlw      $8,%%ymm0,%%ymm0              \n"
+      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x40(%0),%0                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_ab64),          // %0
                  "+r"(dst_argb),          // %1
                  "+r"(width)              // %2
@@ -1465,7 +1511,9 @@ void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
       "movdqa      %%xmm4,%%xmm6                 \n"
       "pmaddubsw   %%xmm5,%%xmm6                 \n"
       "phaddw      %%xmm6,%%xmm6                 \n"
-      "psubw       %%xmm6,%%xmm7                 \n" LABELALIGN "" RGBTOY(xmm7)
+      "psubw       %%xmm6,%%xmm7                 \n"
+      LABELALIGN ""
+      RGBTOY(xmm7)
       : "+r"(src_argb),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
@@ -1489,8 +1537,10 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
       "vpmaddubsw  %%ymm5,%%ymm4,%%ymm6          \n"
       "vphaddw     %%ymm6,%%ymm6,%%ymm6          \n"
       "vpsubw      %%ymm6,%%ymm7,%%ymm7          \n"
-      "vmovdqa     %4,%%ymm6                     \n" LABELALIGN
-      "" RGBTOY_AVX2(ymm7) "vzeroupper  \n"
+      "vmovdqa     %4,%%ymm6                     \n"
+      LABELALIGN ""
+      RGBTOY_AVX2(ymm7)
+      "vzeroupper  \n"
       : "+r"(src_argb),         // %0
         "+r"(dst_y),            // %1
         "+r"(width)             // %2
@@ -1501,9 +1551,8 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
 }
 #endif
 
-#if defined(HAS_ARGBTOYROW_AVX512BW) || \
-    defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW)
-static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8,  12, 1, 5, 9,  13,
+#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW)
+static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
                                                     2, 6, 10, 14, 3, 7, 11, 15};
 #endif
 
@@ -1521,14 +1570,15 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
       "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
       "vpsllw      $15,%%zmm16,%%zmm5            \n"
       "vpacksswb   %%zmm5,%%zmm5,%%zmm5          \n"
-      "vpsrlw      $15,%%zmm16,%%zmm16           \n"  // zmm16 = 1
+      "vpsrlw      $15,%%zmm16,%%zmm16           \n" // zmm16 = 1
       "vbroadcasti64x4 0(%3),%%zmm4              \n"
       "vbroadcasti64x4 0x60(%3),%%zmm7           \n"
       "vpmaddubsw  %%zmm5,%%zmm4,%%zmm6          \n"
       "vpmaddwd    %%zmm16,%%zmm6,%%zmm6         \n"
       "vpackssdw   %%zmm6,%%zmm6,%%zmm6          \n"
       "vpsubw      %%zmm6,%%zmm7,%%zmm7          \n"
-      "vmovups     %4,%%zmm6                     \n" LABELALIGN
+      "vmovups     %4,%%zmm6                     \n"
+      LABELALIGN
       "1:          \n"
       "vmovups     (%0),%%zmm0                   \n"
       "vmovups     0x40(%0),%%zmm1               \n"
@@ -1560,13 +1610,13 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
       "sub         $0x40,%2                      \n"
       "jg          1b                            \n"
       "vzeroupper  \n"
-      : "+r"(src_argb),              // %0
-        "+r"(dst_y),                 // %1
-        "+r"(width)                  // %2
-      : "r"(c),                      // %3
-        "m"(kPermdARGBToY_AVX512BW)  // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7", "xmm16");
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "r"(c),          // %3
+        "m"(kPermdARGBToY_AVX512BW) // %4
+      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
+        "zmm7", "zmm16");
 }
 #endif
 
@@ -1707,8 +1757,8 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 #endif
       : "r"(c),                 // %4
         "m"(kPermdARGBToY_AVX)  // %5
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+        "ymm7");
 }
 #endif  // HAS_ARGBTOUV444ROW_AVX2
 
@@ -1722,8 +1772,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
   asm volatile(
       "vbroadcasti64x4 0x20(%4),%%zmm3               \n"  // kRGBToU
       "vbroadcasti64x4 0x40(%4),%%zmm4               \n"  // kRGBToV
-      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"      // -1
-      "vpsllw      $15,%%zmm16,%%zmm5            \n"      // 0x8000
+      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"  // -1
+      "vpsllw      $15,%%zmm16,%%zmm5            \n"  // 0x8000
       "vmovups     %5,%%zmm7                     \n"
       "sub         %1,%2                         \n"
 
@@ -1787,8 +1837,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
 #endif
       : "r"(c),                      // %4
         "m"(kPermdARGBToY_AVX512BW)  // %5
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7", "xmm16");
+      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
+        "zmm7", "zmm16");
 }
 #endif  // HAS_ARGBTOUV444ROW_AVX512BW
 
@@ -1883,8 +1933,8 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                             int width,
                             const struct ArgbConstants* c) {
   asm volatile(
-      "vbroadcasti128 0x20(%5),%%ymm4           \n"   // RGBToU
-      "vbroadcasti128 0x40(%5),%%ymm5           \n"   // RGBToV
+      "vbroadcasti128 0x20(%5),%%ymm4           \n"  // RGBToU
+      "vbroadcasti128 0x40(%5),%%ymm5           \n"  // RGBToV
       "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 0x0101
       "vpabsb      %%ymm6,%%ymm6                 \n"
       "vmovdqa     %6,%%ymm7                     \n"  // kShuffleAARRGGBB
@@ -1964,6 +2014,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBTOUV444ROW_SSSE3
 
+
 #ifdef HAS_ARGBTOYROW_AVX2
 void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
   ARGBToYMatrixRow_AVX2(src_rgba, dst_y, width, &kRgbaI601Constants);
@@ -1976,6 +2027,7 @@ void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
 }
 #endif
 
+
 #ifdef HAS_ARGBTOYROW_AVX512BW
 void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
   ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kArgbI601Constants);
@@ -2183,8 +2235,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
       "vbroadcasti64x4 0x20(%5),%%zmm4               \n"  // RGBToU
       "vbroadcasti64x4 0x40(%5),%%zmm5               \n"  // RGBToV
       "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
-      "vpabsb      %%zmm16,%%zmm6                \n"      // 0x0101
-      "vpsllw      $15,%%zmm16,%%zmm17           \n"      // 0x8000
+      "vpabsb      %%zmm16,%%zmm6                \n"  // 0x0101
+      "vpsllw      $15,%%zmm16,%%zmm17           \n"  // 0x8000
       "vbroadcasti64x4 %6,%%zmm7                     \n"  // kShuffleAARRGGBB
       "vmovups     %7,%%zmm18                    \n"  // kPermdARGBToY_AVX512BW
       "vmovups     %8,%%zmm19                    \n"  // kPermdARGBToUV_AVX512BW
@@ -2218,8 +2270,7 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
       "vpmaddubsw  %%zmm5,%%zmm0,%%zmm0          \n"  // 16 V
       "vpmaddwd    %%zmm16,%%zmm1,%%zmm1         \n"
       "vpmaddwd    %%zmm16,%%zmm0,%%zmm0         \n"
-      "vpackssdw   %%zmm0,%%zmm1,%%zmm0          \n"  // mutates (U in lower, V
-                                                      // in upper)
+      "vpackssdw   %%zmm0,%%zmm1,%%zmm0          \n"  // mutates (U in lower, V in upper)
       "vpaddw      %%zmm17,%%zmm0,%%zmm0         \n"
       "vpsrlw      $0x8,%%zmm0,%%zmm0            \n"
       "vpackuswb   %%zmm0,%%zmm0,%%zmm0          \n"  // mutates
@@ -2247,8 +2298,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
         "m"(kShuffleAARRGGBB),              // %6
         "m"(kPermdARGBToY_AVX512BW),        // %7
         "m"(kPermdARGBToUV_AVX512BW)        // %8
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7", "xmm16", "xmm17", "xmm18", "xmm19");
+      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
+        "zmm7", "zmm16", "zmm17", "zmm18", "zmm19");
 }
 
 void ARGBToUVRow_AVX512BW(const uint8_t* src_argb,
@@ -2669,12 +2720,12 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA444 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READYUVA444 YUVTORGB(yuvconstants)
                    STOREARGB
-               "subl        $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "subl        $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -2995,12 +3046,12 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA210 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READYUVA210 YUVTORGB(yuvconstants)
                    STOREARGB
-               "subl        $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "subl        $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),  // %[y_buf]
                  [u_buf] "+r"(u_buf),  // %[u_buf]
                  [v_buf] "+r"(v_buf),  // %[v_buf]
@@ -3027,12 +3078,12 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA410 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READYUVA410 YUVTORGB(yuvconstants)
                    STOREARGB
-               "subl        $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "subl        $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),  // %[y_buf]
                  [u_buf] "+r"(u_buf),  // %[u_buf]
                  [v_buf] "+r"(v_buf),  // %[v_buf]
@@ -3093,12 +3144,12 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA422 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READYUVA422 YUVTORGB(yuvconstants)
                    STOREARGB
-               "subl        $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "subl        $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -3121,12 +3172,12 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-               LABELALIGN "1:          \n" READNV12 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READNV12 YUVTORGB(yuvconstants)
                    STOREARGB
-               "sub         $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -3142,12 +3193,12 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-               LABELALIGN "1:          \n" READNV21 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READNV21 YUVTORGB(yuvconstants)
                    STOREARGB
-               "sub         $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),               // %[y_buf]
                  [vu_buf] "+r"(vu_buf),             // %[vu_buf]
                  [dst_argb] "+r"(dst_argb),         // %[dst_argb]
@@ -3165,7 +3216,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
   asm volatile(
       "movdqa      %[kShuffleYUY2Y],%%xmm6       \n"
       "movdqa      %[kShuffleYUY2UV],%%xmm7      \n" YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
       LABELALIGN "1:          \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB
       "sub         $0x8,%[width]                 \n"
@@ -3186,7 +3237,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
   asm volatile(
       "movdqa      %[kShuffleUYVYY],%%xmm6       \n"
       "movdqa      %[kShuffleUYVYUV],%%xmm7      \n" YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
       LABELALIGN "1:          \n" READUYVY YUVTORGB(yuvconstants) STOREARGB
       "sub         $0x8,%[width]                 \n"
@@ -3206,12 +3257,12 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-               LABELALIGN "1:          \n" READP210 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READP210 YUVTORGB(yuvconstants)
                    STOREARGB
-               "sub         $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[u_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -3227,12 +3278,12 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-               LABELALIGN "1:          \n" READP410 YUVTORGB(yuvconstants)
+      LABELALIGN "1:          \n" READP410 YUVTORGB(yuvconstants)
                    STOREARGB
-               "sub         $0x8,%[width]                 \n"
-               "jg          1b                            \n"
+      "sub         $0x8,%[width]                 \n"
+      "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[u_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4055,13 +4106,13 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA210_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READYUVA210_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "subl        $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "subl        $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
 
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
@@ -4090,13 +4141,13 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA410_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READYUVA410_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "subl        $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "subl        $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
 
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
@@ -4165,13 +4216,13 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA444_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READYUVA444_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "subl        $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "subl        $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -4199,13 +4250,13 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-               LABELALIGN "1:          \n" READYUVA422_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READYUVA422_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "subl        $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "subl        $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -4275,13 +4326,13 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-               LABELALIGN "1:          \n" READNV12_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READNV12_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "sub         $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4301,13 +4352,13 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-               LABELALIGN "1:          \n" READNV21_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READNV21_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "sub         $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),               // %[y_buf]
                  [vu_buf] "+r"(vu_buf),             // %[vu_buf]
                  [dst_argb] "+r"(dst_argb),         // %[dst_argb]
@@ -4329,7 +4380,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
   asm volatile(
       "vbroadcasti128 %[kShuffleYUY2Y],%%ymm6    \n"
       "vbroadcasti128 %[kShuffleYUY2UV],%%ymm7   \n" YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
       LABELALIGN "1:          \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants)
           STOREARGB_AVX2
@@ -4356,7 +4407,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
   asm volatile(
       "vbroadcasti128 %[kShuffleUYVYY],%%ymm6    \n"
       "vbroadcasti128 %[kShuffleUYVYUV],%%ymm7   \n" YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
       LABELALIGN "1:          \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants)
           STOREARGB_AVX2
@@ -4382,13 +4433,13 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-               LABELALIGN "1:          \n" READP210_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READP210_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "sub         $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4408,13 +4459,13 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-               LABELALIGN "1:          \n" READP410_AVX2 YUVTORGB_AVX2(
+      LABELALIGN "1:          \n" READP410_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-               "sub         $0x10,%[width]                \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "sub         $0x10,%[width]                \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4593,16 +4644,16 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
 
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile("movdqa      %3,%%xmm5                     \n"
+      asm volatile("movdqa      %3,%%xmm5                     \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
-               "pshufb      %%xmm5,%%xmm0                 \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "lea         0x10(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
                : "+r"(src),           // %0
                  "+r"(dst),           // %1
                  "+r"(temp_width)     // %2
@@ -4611,44 +4662,21 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
-#ifdef HAS_MIRRORROW_AVX512BW
-void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {
-  ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile("vbroadcasti32x4 %3,%%zmm5                 \n"
-
-               LABELALIGN
-               "1:          \n"
-               "vmovdqu8    -0x40(%0,%2,1),%%zmm0         \n"
-               "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
-               "vshufi64x2  $0x1b,%%zmm0,%%zmm0,%%zmm0    \n"
-               "vmovdqu8    %%zmm0,(%1)                   \n"
-               "lea         0x40(%1),%1                   \n"
-               "sub         $0x40,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
-               : "+r"(src),           // %0
-                 "+r"(dst),           // %1
-                 "+r"(temp_width)     // %2
-               : "m"(kShuffleMirror)  // %3
-               : "memory", "cc", "zmm0", "zmm5");
-}
-#endif  // HAS_MIRRORROW_AVX512BW
-
 #ifdef HAS_MIRRORROW_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
+      asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
 
                LABELALIGN
-               "1:          \n"
-               "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
-               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-               "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x20,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src),           // %0
                  "+r"(dst),           // %1
                  "+r"(temp_width)     // %2
@@ -4657,82 +4685,6 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2
 
-#if defined(HAS_MIRRORSPLITUVROW_AVX2) || defined(HAS_MIRRORSPLITUVROW_AVX512BW)
-// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
-                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-#endif
-
-#ifdef HAS_MIRRORSPLITUVROW_AVX512BW
-static const uint64_t kMirrorSplitUVPermute[8] = {6, 4, 2, 0, 7, 5, 3, 1};
-
-void MirrorSplitUVRow_AVX512BW(const uint8_t* src,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width) {
-  ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile(
-      "vbroadcasti32x4 %4,%%zmm1                 \n"
-      "lea         -0x40(%0,%3,2),%0             \n"
-      "sub         %1,%2                         \n"
-      "vmovdqu64   %5,%%zmm3                     \n"
-
-      LABELALIGN
-      "1:          \n"
-      "vmovdqu8    (%0),%%zmm0                   \n"
-      "lea         -0x40(%0),%0                  \n"
-      "vpshufb     %%zmm1,%%zmm0,%%zmm0          \n"
-      "vpermq      %%zmm0,%%zmm3,%%zmm0          \n"
-      "vextracti64x4 $0x1,%%zmm0,%%ymm2          \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x20,%3                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src),                   // %0
-        "+r"(dst_u),                 // %1
-        "+r"(dst_v),                 // %2
-        "+r"(temp_width)             // %3
-      : "m"(kShuffleMirrorSplitUV),  // %4
-        "m"(kMirrorSplitUVPermute)   // %5
-      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3");
-}
-#endif  // HAS_MIRRORSPLITUVROW_AVX512BW
-
-#ifdef HAS_MIRRORSPLITUVROW_AVX2
-void MirrorSplitUVRow_AVX2(const uint8_t* src,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width) {
-  ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile(
-      "vbroadcasti128 %4,%%ymm1                  \n"
-      "lea         -0x20(%0,%3,2),%0             \n"
-      "sub         %1,%2                         \n"
-
-      LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "lea         -0x20(%0),%0                  \n"
-      "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"
-      "vpermq      $0x72,%%ymm0,%%ymm0           \n"
-      "vextracti128 $0x1,%%ymm0,%%xmm2           \n"
-      "vmovdqu     %%xmm0,(%1)                   \n"
-      "vmovdqu     %%xmm2,0x00(%1,%2,1)          \n"
-      "lea         0x10(%1),%1                   \n"
-      "sub         $0x10,%3                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src),                  // %0
-        "+r"(dst_u),                // %1
-        "+r"(dst_v),                // %2
-        "+r"(temp_width)            // %3
-      : "m"(kShuffleMirrorSplitUV)  // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2");
-}
-#endif  // HAS_MIRRORSPLITUVROW_AVX2
-
 #ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the UV.
 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
@@ -4740,16 +4692,16 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
 
 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile("movdqa      %3,%%xmm5                     \n"
+      asm volatile("movdqa      %3,%%xmm5                     \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
-               "pshufb      %%xmm5,%%xmm0                 \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "lea         0x10(%1),%1                   \n"
-               "sub         $0x8,%2                       \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
                : "+r"(src_uv),          // %0
                  "+r"(dst_uv),          // %1
                  "+r"(temp_width)       // %2
@@ -4761,18 +4713,18 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
 #ifdef HAS_MIRRORUVROW_AVX2
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
+      asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
 
                LABELALIGN
-               "1:          \n"
-               "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
-               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-               "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_uv),          // %0
                  "+r"(dst_uv),          // %1
                  "+r"(temp_width)       // %2
@@ -4781,6 +4733,39 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
 }
 #endif  // HAS_MIRRORUVROW_AVX2
 
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width) {
+  ptrdiff_t temp_width = (ptrdiff_t)(width);
+  asm volatile(
+      "movdqa      %4,%%xmm1                     \n"
+      "lea         -0x10(%0,%3,2),%0             \n"
+      "sub         %1,%2                         \n"
+
+      LABELALIGN
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "lea         -0x10(%0),%0                  \n"
+      "pshufb      %%xmm1,%%xmm0                 \n"
+      "movlpd      %%xmm0,(%1)                   \n"
+      "movhpd      %%xmm0,0x00(%1,%2,1)          \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $8,%3                         \n"
+      "jg          1b                            \n"
+      : "+r"(src),                  // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(temp_width)            // %3
+      : "m"(kShuffleMirrorSplitUV)  // %4
+      : "memory", "cc", "xmm0", "xmm1");
+}
+#endif  // HAS_MIRRORSPLITUVROW_SSSE3
+
 #ifdef HAS_RGB24MIRRORROW_SSSE3
 
 // Shuffle first 5 pixels to last 5 mirrored.  first byte zero
@@ -4828,73 +4813,21 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
 }
 #endif  // HAS_RGB24MIRRORROW_SSSE3
 
-#ifdef HAS_RGB24MIRRORROW_AVX2
-// Shuffle first 10 pixels to last 10 mirrored.  first byte zero
-static const uvec8 kShuffleMirrorRGB0_AVX = {
-    128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
-
-// Shuffle last 2 pixels to first 2 mirrored.  last byte zero
-static const uvec8 kShuffleMirrorRGB1_AVX = {
-    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
-
-void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_rgb24,
-                         int width) {
-  ptrdiff_t temp_width = (ptrdiff_t)(width);
-  src_rgb24 += width * 3 - 96;
-  asm volatile(
-      "vbroadcasti128 %3,%%ymm4                  \n"
-      "vmovdqa     %4,%%xmm5                     \n"
-
-      LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%xmm0                   \n"  // first 10
-      "vinserti128 $1,15(%0),%%ymm0,%%ymm0       \n"
-      "vmovdqu     30(%0),%%xmm1                 \n"  // next 10
-      "vinserti128 $1,45(%0),%%ymm1,%%ymm1       \n"
-      "vmovdqu     60(%0),%%xmm2                 \n"  // next 10
-      "vinserti128 $1,75(%0),%%ymm2,%%ymm2       \n"
-      "vmovdqu     80(%0),%%xmm3                 \n"  // last 2 special
-      "vpshufb     %%ymm4,%%ymm0,%%ymm0          \n"
-      "vpshufb     %%ymm4,%%ymm1,%%ymm1          \n"
-      "vpshufb     %%ymm4,%%ymm2,%%ymm2          \n"
-      "vpshufb     %%xmm5,%%xmm3,%%xmm3          \n"
-      "lea         -0x60(%0),%0                  \n"
-      "vmovdqu     %%xmm0,80(%1)                 \n"
-      "vextracti128 $1,%%ymm0,65(%1)             \n"
-      "vmovdqu     %%xmm1,50(%1)                 \n"
-      "vextracti128 $1,%%ymm1,35(%1)             \n"
-      "vmovdqu     %%xmm2,20(%1)                 \n"
-      "vextracti128 $1,%%ymm2,5(%1)              \n"
-      "vmovq       %%xmm3,0(%1)                  \n"
-      "lea         0x60(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src_rgb24),              // %0
-        "+r"(dst_rgb24),              // %1
-        "+r"(temp_width)              // %2
-      : "m"(kShuffleMirrorRGB0_AVX),  // %3
-        "m"(kShuffleMirrorRGB1_AVX)   // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif  // HAS_RGB24MIRRORROW_AVX2
-
 #ifdef HAS_ARGBMIRRORROW_SSE2
 
 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile("lea         -0x10(%0,%2,4),%0             \n"
+      asm volatile("lea         -0x10(%0,%2,4),%0             \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
-               "lea         -0x10(%0),%0                  \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "lea         0x10(%1),%1                   \n"
-               "sub         $0x4,%2                       \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
+      "lea         -0x10(%0),%0                  \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
                : "+r"(src),        // %0
                  "+r"(dst),        // %1
                  "+r"(temp_width)  // %2
@@ -4908,16 +4841,16 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-  asm volatile("vmovdqu     %3,%%ymm5                     \n"
+      asm volatile("vmovdqu     %3,%%ymm5                     \n"
 
                LABELALIGN
-               "1:          \n"
-               "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x8,%2                       \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src),                    // %0
                  "+r"(dst),                    // %1
                  "+r"(temp_width)              // %2
@@ -4964,47 +4897,6 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
 }
 #endif  // HAS_SPLITUVROW_AVX2
 
-#ifdef HAS_SPLITUVROW_AVX512BW
-static const uint64_t kSplitUVPermute[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-
-void SplitUVRow_AVX512BW(const uint8_t* src_uv,
-                         uint8_t* dst_u,
-                         uint8_t* dst_v,
-                         int width) {
-  asm volatile(
-      "vpternlogd  $0xff,%%zmm5,%%zmm5,%%zmm5    \n"
-      "vpsrlw      $0x8,%%zmm5,%%zmm5            \n"
-      "vmovdqu64   %4,%%zmm4                     \n"
-      "sub         %1,%2                         \n"
-
-      LABELALIGN
-      "1:          \n"
-      "vmovdqu8    (%0),%%zmm0                   \n"
-      "vmovdqu8    0x40(%0),%%zmm1               \n"
-      "lea         0x80(%0),%0                   \n"
-      "vpsrlw      $0x8,%%zmm0,%%zmm2            \n"
-      "vpsrlw      $0x8,%%zmm1,%%zmm3            \n"
-      "vpandd      %%zmm5,%%zmm0,%%zmm0          \n"
-      "vpandd      %%zmm5,%%zmm1,%%zmm1          \n"
-      "vpackuswb   %%zmm1,%%zmm0,%%zmm0          \n"
-      "vpackuswb   %%zmm3,%%zmm2,%%zmm2          \n"
-      "vpermq      %%zmm0,%%zmm4,%%zmm0          \n"
-      "vpermq      %%zmm2,%%zmm4,%%zmm2          \n"
-      "vmovdqu8    %%zmm0,(%1)                   \n"
-      "vmovdqu8    %%zmm2,0x00(%1,%2,1)          \n"
-      "lea         0x40(%1),%1                   \n"
-      "sub         $0x40,%3                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src_uv),         // %0
-        "+r"(dst_u),          // %1
-        "+r"(dst_v),          // %2
-        "+r"(width)           // %3
-      : "m"(kSplitUVPermute)  // %4
-      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5");
-}
-#endif  // HAS_SPLITUVROW_AVX512BW
-
 #ifdef HAS_SPLITUVROW_SSE2
 void SplitUVRow_SSE2(const uint8_t* src_uv,
                      uint8_t* dst_u,
@@ -5182,20 +5074,20 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_uv,
                          int width) {
-  asm volatile("sub         %0,%1                         \n"
+      asm volatile("sub         %0,%1                         \n"
 
                LABELALIGN
-               "1:          \n"
-               "vpmovzxbw   (%0),%%zmm0                   \n"
-               "vpmovzxbw   0x00(%0,%1,1),%%zmm1          \n"
-               "lea         0x20(%0),%0                   \n"
-               "vpsllw      $0x8,%%zmm1,%%zmm1            \n"
-               "vporq       %%zmm0,%%zmm1,%%zmm2          \n"
-               "vmovdqu64   %%zmm2,(%2)                   \n"
-               "lea         0x40(%2),%2                   \n"
-               "sub         $0x20,%3                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vpmovzxbw   (%0),%%zmm0                   \n"
+      "vpmovzxbw   0x00(%0,%1,1),%%zmm1          \n"
+      "lea         0x20(%0),%0                   \n"
+      "vpsllw      $0x8,%%zmm1,%%zmm1            \n"
+      "vporq       %%zmm0,%%zmm1,%%zmm2          \n"
+      "vmovdqu64   %%zmm2,(%2)                   \n"
+      "lea         0x40(%2),%2                   \n"
+      "sub         $0x20,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5210,20 +5102,20 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile("sub         %0,%1                         \n"
+      asm volatile("sub         %0,%1                         \n"
 
                LABELALIGN
-               "1:          \n"
-               "vpmovzxbw   (%0),%%ymm0                   \n"
-               "vpmovzxbw   0x00(%0,%1,1),%%ymm1          \n"
-               "lea         0x10(%0),%0                   \n"
-               "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
-               "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
-               "vmovdqu     %%ymm2,(%2)                   \n"
-               "lea         0x20(%2),%2                   \n"
-               "sub         $0x10,%3                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vpmovzxbw   (%0),%%ymm0                   \n"
+      "vpmovzxbw   0x00(%0,%1,1),%%ymm1          \n"
+      "lea         0x10(%0),%0                   \n"
+      "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
+      "vmovdqu     %%ymm2,(%2)                   \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5238,21 +5130,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile("sub         %0,%1                         \n"
+      asm volatile("sub         %0,%1                         \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "movdqu      0x00(%0,%1,1),%%xmm1          \n"
-               "lea         0x10(%0),%0                   \n"
-               "movdqa      %%xmm0,%%xmm2                 \n"
-               "punpcklbw   %%xmm1,%%xmm0                 \n"
-               "punpckhbw   %%xmm1,%%xmm2                 \n"
-               "movdqu      %%xmm0,(%2)                   \n"
-               "movdqu      %%xmm2,0x10(%2)               \n"
-               "lea         0x20(%2),%2                   \n"
-               "sub         $0x10,%3                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqa      %%xmm0,%%xmm2                 \n"
+      "punpcklbw   %%xmm1,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm2                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "movdqu      %%xmm2,0x10(%2)               \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x10,%3                      \n"
+      "jg          1b                            \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5487,24 +5379,24 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y,
                               uint8_t* dst_y,
                               int scale,
                               int width) {
-  asm volatile("vpbroadcastw %3,%%zmm2                    \n"
+      asm volatile("vpbroadcastw %3,%%zmm2                    \n"
 
                // 64 pixels per loop.
                LABELALIGN
-               "1:          \n"
-               "vmovups     (%0),%%zmm0                   \n"
-               "vmovups     0x40(%0),%%zmm1               \n"
-               "add         $0x80,%0                      \n"
-               "vpmulhuw    %%zmm2,%%zmm0,%%zmm0          \n"
-               "vpmulhuw    %%zmm2,%%zmm1,%%zmm1          \n"
-               "vpmovuswb   %%zmm0,%%ymm0                 \n"
-               "vpmovuswb   %%zmm1,%%ymm1                 \n"
-               "vmovups     %%ymm0,(%1)                   \n"
-               "vmovups     %%ymm1,0x20(%1)               \n"
-               "add         $0x40,%1                      \n"
-               "sub         $0x40,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovups     (%0),%%zmm0                   \n"
+      "vmovups     0x40(%0),%%zmm1               \n"
+      "add         $0x80,%0                      \n"
+      "vpmulhuw    %%zmm2,%%zmm0,%%zmm0          \n"
+      "vpmulhuw    %%zmm2,%%zmm1,%%zmm1          \n"
+      "vpmovuswb   %%zmm0,%%ymm0                 \n"
+      "vpmovuswb   %%zmm1,%%ymm1                 \n"
+      "vmovups     %%ymm0,(%1)                   \n"
+      "vmovups     %%ymm1,0x20(%1)               \n"
+      "add         $0x40,%1                      \n"
+      "sub         $0x40,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_y),  // %0
                  "+r"(dst_y),  // %1
                  "+r"(width)   // %2
@@ -5554,24 +5446,24 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           int scale,
                           int width) {
   const int shift = __builtin_clz(scale) - 15;
-  asm volatile("vmovd       %3,%%xmm2                     \n"
+      asm volatile("vmovd       %3,%%xmm2                     \n"
 
                // 32 pixels per loop.
                LABELALIGN
-               "1:          \n"
-               "vmovdqu     (%0),%%ymm0                   \n"
-               "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-               "add         $0x20,%0                      \n"
-               "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
-               "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
-               "vpsrlw      %%xmm2,%%ymm0,%%ymm0          \n"
-               "vpsrlw      %%xmm2,%%ymm1,%%ymm1          \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "vmovdqu     %%ymm1,0x20(%1)               \n"
-               "add         $0x40,%1                      \n"
-               "sub         $0x20,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "add         $0x20,%0                      \n"
+      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
+      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpsrlw      %%xmm2,%%ymm0,%%ymm0          \n"
+      "vpsrlw      %%xmm2,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "add         $0x40,%1                      \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_y),  // %0
                  "+r"(dst_y),  // %1
                  "+r"(width)   // %2
@@ -6352,7 +6244,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r,
 #if defined(__i386__)
       : "m"(shift)  // %5
 #else
-      : "rm"(shift)  // %5
+      : "rm"(shift)   // %5
 #endif
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
@@ -6688,7 +6580,7 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {
 // Multiple of 1.
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile("rep         movsb                         \n"
+      asm volatile("rep         movsb                         \n"
                : "+S"(src),       // %0
                  "+D"(dst),       // %1
                  "+c"(width_tmp)  // %2
@@ -6898,7 +6790,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
   const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile("rep         stosl                         \n"
+      asm volatile("rep         stosl                         \n"
                : "+D"(dst),       // %0
                  "+c"(width_tmp)  // %1
                : "a"(v32)         // %2
@@ -6907,7 +6799,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
 
 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile("rep         stosb                         \n"
+      asm volatile("rep         stosb                         \n"
                : "+D"(dst),       // %0
                  "+c"(width_tmp)  // %1
                : "a"(v8)          // %2
@@ -6916,7 +6808,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
 
 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile("rep         stosl                         \n"
+      asm volatile("rep         stosl                         \n"
                : "+D"(dst_argb),  // %0
                  "+c"(width_tmp)  // %1
                : "a"(v32)         // %2
@@ -8077,28 +7969,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile("pxor        %%xmm5,%%xmm5                 \n"
+      asm volatile("pxor        %%xmm5,%%xmm5                 \n"
 
                // 4 pixel loop.
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "lea         0x10(%0),%0                   \n"
-               "movdqu      (%1),%%xmm2                   \n"
-               "lea         0x10(%1),%1                   \n"
-               "movdqu      %%xmm0,%%xmm1                 \n"
-               "movdqu      %%xmm2,%%xmm3                 \n"
-               "punpcklbw   %%xmm0,%%xmm0                 \n"
-               "punpckhbw   %%xmm1,%%xmm1                 \n"
-               "punpcklbw   %%xmm5,%%xmm2                 \n"
-               "punpckhbw   %%xmm5,%%xmm3                 \n"
-               "pmulhuw     %%xmm2,%%xmm0                 \n"
-               "pmulhuw     %%xmm3,%%xmm1                 \n"
-               "packuswb    %%xmm1,%%xmm0                 \n"
-               "movdqu      %%xmm0,(%2)                   \n"
-               "lea         0x10(%2),%2                   \n"
-               "sub         $0x4,%3                       \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "lea         0x10(%0),%0                   \n"
+      "movdqu      (%1),%%xmm2                   \n"
+      "lea         0x10(%1),%1                   \n"
+      "movdqu      %%xmm0,%%xmm1                 \n"
+      "movdqu      %%xmm2,%%xmm3                 \n"
+      "punpcklbw   %%xmm0,%%xmm0                 \n"
+      "punpckhbw   %%xmm1,%%xmm1                 \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "punpckhbw   %%xmm5,%%xmm3                 \n"
+      "pmulhuw     %%xmm2,%%xmm0                 \n"
+      "pmulhuw     %%xmm3,%%xmm1                 \n"
+      "packuswb    %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,(%2)                   \n"
+      "lea         0x10(%2),%2                   \n"
+      "sub         $0x4,%3                       \n"
+      "jg          1b                            \n"
                : "+r"(src_argb),   // %0
                  "+r"(src_argb1),  // %1
                  "+r"(dst_argb),   // %2
@@ -8114,27 +8006,27 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+      asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
 
                // 4 pixel loop.
                LABELALIGN
-               "1:          \n"
-               "vmovdqu     (%0),%%ymm1                   \n"
-               "lea         0x20(%0),%0                   \n"
-               "vmovdqu     (%1),%%ymm3                   \n"
-               "lea         0x20(%1),%1                   \n"
-               "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
-               "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
-               "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
-               "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
-               "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
-               "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
-               "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
-               "vmovdqu     %%ymm0,(%2)                   \n"
-               "lea         0x20(%2),%2                   \n"
-               "sub         $0x8,%3                       \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     (%0),%%ymm1                   \n"
+      "lea         0x20(%0),%0                   \n"
+      "vmovdqu     (%1),%%ymm3                   \n"
+      "lea         0x20(%1),%1                   \n"
+      "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
+      "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
+      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
+      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+      "vmovdqu     %%ymm0,(%2)                   \n"
+      "lea         0x20(%2),%2                   \n"
+      "sub         $0x8,%3                       \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_argb),   // %0
                  "+r"(src_argb1),  // %1
                  "+r"(dst_argb),   // %2
@@ -8796,6 +8688,87 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
 
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int width,
+                          int source_y_fraction) {
+  asm volatile(
+      "sub         %1,%0                         \n"
+      "cmp         $0x0,%3                       \n"
+      "je          100f                          \n"
+      "cmp         $0x80,%3                      \n"
+      "je          50f                           \n"
+
+      "movd        %3,%%xmm0                     \n"
+      "neg         %3                            \n"
+      "add         $0x100,%3                     \n"
+      "movd        %3,%%xmm5                     \n"
+      "punpcklbw   %%xmm0,%%xmm5                 \n"
+      "punpcklwd   %%xmm5,%%xmm5                 \n"
+      "pshufd      $0x0,%%xmm5,%%xmm5            \n"
+      "mov         $0x80808080,%%eax             \n"
+      "movd        %%eax,%%xmm4                  \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+
+      // General purpose row blend.
+      LABELALIGN
+      "1:          \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      0x00(%1,%4,1),%%xmm2          \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm2,%%xmm0                 \n"
+      "punpckhbw   %%xmm2,%%xmm1                 \n"
+      "psubb       %%xmm4,%%xmm0                 \n"
+      "psubb       %%xmm4,%%xmm1                 \n"
+      "movdqa      %%xmm5,%%xmm2                 \n"
+      "movdqa      %%xmm5,%%xmm3                 \n"
+      "pmaddubsw   %%xmm0,%%xmm2                 \n"
+      "pmaddubsw   %%xmm1,%%xmm3                 \n"
+      "paddw       %%xmm4,%%xmm2                 \n"
+      "paddw       %%xmm4,%%xmm3                 \n"
+      "psrlw       $0x8,%%xmm2                   \n"
+      "psrlw       $0x8,%%xmm3                   \n"
+      "packuswb    %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm2,0x00(%1,%0,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "jmp         99f                           \n"
+
+      // Blend 50 / 50.
+      LABELALIGN
+      "50:         \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      0x00(%1,%4,1),%%xmm1          \n"
+      "pavgb       %%xmm1,%%xmm0                 \n"
+      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          50b                           \n"
+      "jmp         99f                           \n"
+
+      // Blend 100 / 0 - Copy row unchanged.
+      LABELALIGN
+      "100:        \n"
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      %%xmm0,0x00(%1,%0,1)          \n"
+      "lea         0x10(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          100b                          \n"
+
+      "99:         \n"
+      : "+r"(dst_ptr),           // %0
+        "+r"(src_ptr),           // %1
+        "+rm"(width),            // %2
+        "+r"(source_y_fraction)  // %3
+      : "r"(src_stride)          // %4
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_INTERPOLATEROW_SSSE3
+
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
 void InterpolateRow_AVX2(uint8_t* dst_ptr,
@@ -8874,107 +8847,26 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
 
-#ifdef HAS_INTERPOLATEROW_16_AVX2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
-                            const uint16_t* src_ptr,
-                            ptrdiff_t src_stride,
-                            int width,
-                            int source_y_fraction) {
-  asm volatile(
-      "sub         %1,%0                         \n"
-      "cmp         $0x0,%3                       \n"
-      "je          100f                          \n"
-      "cmp         $0x80,%3                      \n"
-      "je          50f                           \n"
-
-      "vmovd       %3,%%xmm0                     \n"
-      "neg         %3                            \n"
-      "add         $0x100,%3                     \n"
-      "vmovd       %3,%%xmm5                     \n"
-      "vpunpcklwd  %%xmm0,%%xmm5,%%xmm5          \n"
-      "vpbroadcastd %%xmm5,%%ymm5                \n"
-      "mov         $0x80008000,%%eax             \n"  // 0x80008000 used to bias
-                                                      // unsigned words to
-                                                      // signed range for
-                                                      // vpmaddwd.
-      "vmovd       %%eax,%%xmm4                  \n"
-      "vbroadcastss %%xmm4,%%ymm4                \n"
-      "mov         $8388736,%%eax                \n"  // 32768 * 256 + 128
-                                                      // rounding constant.
-      "vmovd       %%eax,%%xmm3                  \n"
-      "vbroadcastss %%xmm3,%%ymm3                \n"
-
-      LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%1),%%ymm0                   \n"
-      "vmovdqu     (%1,%4,2),%%ymm1              \n"
-      "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"
-      "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"
-      "vpsubw      %%ymm4,%%ymm2,%%ymm2          \n"
-      "vpsubw      %%ymm4,%%ymm0,%%ymm0          \n"
-      "vpmaddwd    %%ymm5,%%ymm2,%%ymm2          \n"
-      "vpmaddwd    %%ymm5,%%ymm0,%%ymm0          \n"
-      "vpaddd      %%ymm3,%%ymm2,%%ymm2          \n"
-      "vpaddd      %%ymm3,%%ymm0,%%ymm0          \n"
-      "vpsrad      $0x8,%%ymm2,%%ymm2            \n"
-      "vpsrad      $0x8,%%ymm0,%%ymm0            \n"
-      "vpackusdw   %%ymm2,%%ymm0,%%ymm0          \n"
-      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
-      "jmp         99f                           \n"
-
-      "50:         \n" LABELALIGN
-      "2:          \n"
-      "vmovdqu     (%1),%%ymm0                   \n"
-      "vpavgw      (%1,%4,2),%%ymm0,%%ymm0       \n"
-      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          2b                            \n"
-      "jmp         99f                           \n"
-
-      "100:        \n" LABELALIGN
-      "3:          \n"
-      "vmovdqu     (%1),%%ymm0                   \n"
-      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          3b                            \n"
-
-      "99:         \n"
-      "vzeroupper  \n"
-      : "+r"(dst_ptr),           // %0
-        "+r"(src_ptr),           // %1
-        "+r"(width),             // %2
-        "+r"(source_y_fraction)  // %3
-      : "r"(src_stride)          // %4
-      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#endif  // HAS_INTERPOLATEROW_16_AVX2
-
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           const uint8_t* shuffler,
                           int width) {
-  asm volatile("movdqu      (%3),%%xmm5                   \n"
+      asm volatile("movdqu      (%3),%%xmm5                   \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "movdqu      0x10(%0),%%xmm1               \n"
-               "lea         0x20(%0),%0                   \n"
-               "pshufb      %%xmm5,%%xmm0                 \n"
-               "pshufb      %%xmm5,%%xmm1                 \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "movdqu      %%xmm1,0x10(%1)               \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x8,%2                       \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -8989,21 +8881,21 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
-  asm volatile("vbroadcasti128 (%3),%%ymm5                \n"
+      asm volatile("vbroadcasti128 (%3),%%ymm5                \n"
 
                LABELALIGN
-               "1:          \n"
-               "vmovdqu     (%0),%%ymm0                   \n"
-               "vmovdqu     0x20(%0),%%ymm1               \n"
-               "lea         0x40(%0),%0                   \n"
-               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-               "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "vmovdqu     %%ymm1,0x20(%1)               \n"
-               "lea         0x40(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -9012,59 +8904,30 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-#ifdef HAS_ARGBSHUFFLEROW_AVX512BW
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
-                             uint8_t* dst_argb,
-                             const uint8_t* shuffler,
-                             int width) {
-  asm volatile("vbroadcasti32x4 (%3),%%zmm5               \n"
-
-               LABELALIGN
-               "1:          \n"
-               "vmovdqu8    (%0),%%zmm0                   \n"
-               "vmovdqu8    0x40(%0),%%zmm1               \n"
-               "lea         0x80(%0),%0                   \n"
-               "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
-               "vpshufb     %%zmm5,%%zmm1,%%zmm1          \n"
-               "vmovdqu8    %%zmm0,(%1)                   \n"
-               "vmovdqu8    %%zmm1,0x40(%1)               \n"
-               "lea         0x80(%1),%1                   \n"
-               "sub         $0x20,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
-               : "+r"(src_argb),  // %0
-                 "+r"(dst_argb),  // %1
-                 "+r"(width)      // %2
-               : "r"(shuffler)    // %3
-               : "memory", "cc", "xmm0", "xmm1", "xmm5");
-}
-#endif  // HAS_ARGBSHUFFLEROW_AVX512BW
-
 #ifdef HAS_I422TOYUY2ROW_SSE2
 void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile("sub         %1,%2                         \n"
+      asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-               "1:          \n"
-               "movq        (%1),%%xmm2                   \n"
-               "movq        0x00(%1,%2,1),%%xmm1          \n"
-               "add         $0x8,%1                       \n"
-               "punpcklbw   %%xmm1,%%xmm2                 \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "add         $0x10,%0                      \n"
-               "movdqa      %%xmm0,%%xmm1                 \n"
-               "punpcklbw   %%xmm2,%%xmm0                 \n"
-               "punpckhbw   %%xmm2,%%xmm1                 \n"
-               "movdqu      %%xmm0,(%3)                   \n"
-               "movdqu      %%xmm1,0x10(%3)               \n"
-               "lea         0x20(%3),%3                   \n"
-               "sub         $0x10,%4                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movq        (%1),%%xmm2                   \n"
+      "movq        0x00(%1,%2,1),%%xmm1          \n"
+      "add         $0x8,%1                       \n"
+      "punpcklbw   %%xmm1,%%xmm2                 \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "add         $0x10,%0                      \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "punpcklbw   %%xmm2,%%xmm0                 \n"
+      "punpckhbw   %%xmm2,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%3)                   \n"
+      "movdqu      %%xmm1,0x10(%3)               \n"
+      "lea         0x20(%3),%3                   \n"
+      "sub         $0x10,%4                      \n"
+      "jg          1b                            \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -9081,24 +8944,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile("sub         %1,%2                         \n"
+      asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-               "1:          \n"
-               "movq        (%1),%%xmm2                   \n"
-               "movq        0x00(%1,%2,1),%%xmm1          \n"
-               "add         $0x8,%1                       \n"
-               "punpcklbw   %%xmm1,%%xmm2                 \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "movdqa      %%xmm2,%%xmm1                 \n"
-               "add         $0x10,%0                      \n"
-               "punpcklbw   %%xmm0,%%xmm1                 \n"
-               "punpckhbw   %%xmm0,%%xmm2                 \n"
-               "movdqu      %%xmm1,(%3)                   \n"
-               "movdqu      %%xmm2,0x10(%3)               \n"
-               "lea         0x20(%3),%3                   \n"
-               "sub         $0x10,%4                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movq        (%1),%%xmm2                   \n"
+      "movq        0x00(%1,%2,1),%%xmm1          \n"
+      "add         $0x8,%1                       \n"
+      "punpcklbw   %%xmm1,%%xmm2                 \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqa      %%xmm2,%%xmm1                 \n"
+      "add         $0x10,%0                      \n"
+      "punpcklbw   %%xmm0,%%xmm1                 \n"
+      "punpckhbw   %%xmm0,%%xmm2                 \n"
+      "movdqu      %%xmm1,(%3)                   \n"
+      "movdqu      %%xmm2,0x10(%3)               \n"
+      "lea         0x20(%3),%3                   \n"
+      "sub         $0x10,%4                      \n"
+      "jg          1b                            \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -9115,27 +8978,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile("sub         %1,%2                         \n"
+      asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-               "1:          \n"
-               "vpmovzxbw   (%1),%%ymm1                   \n"
-               "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
-               "add         $0x10,%1                      \n"
-               "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
-               "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
-               "vmovdqu     (%0),%%ymm0                   \n"
-               "add         $0x20,%0                      \n"
-               "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
-               "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
-               "vextractf128 $0x0,%%ymm1,(%3)             \n"
-               "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
-               "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
-               "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
-               "lea         0x40(%3),%3                   \n"
-               "sub         $0x20,%4                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vpmovzxbw   (%1),%%ymm1                   \n"
+      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
+      "add         $0x10,%1                      \n"
+      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
+      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "add         $0x20,%0                      \n"
+      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
+      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
+      "vextractf128 $0x0,%%ymm1,(%3)             \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
+      "lea         0x40(%3),%3                   \n"
+      "sub         $0x20,%4                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -9152,27 +9015,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile("sub         %1,%2                         \n"
+      asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-               "1:          \n"
-               "vpmovzxbw   (%1),%%ymm1                   \n"
-               "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
-               "add         $0x10,%1                      \n"
-               "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
-               "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
-               "vmovdqu     (%0),%%ymm0                   \n"
-               "add         $0x20,%0                      \n"
-               "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
-               "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
-               "vextractf128 $0x0,%%ymm1,(%3)             \n"
-               "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
-               "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
-               "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
-               "lea         0x40(%3),%3                   \n"
-               "sub         $0x20,%4                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vpmovzxbw   (%1),%%ymm1                   \n"
+      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
+      "add         $0x10,%1                      \n"
+      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
+      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "add         $0x20,%0                      \n"
+      "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
+      "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
+      "vextractf128 $0x0,%%ymm1,(%3)             \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
+      "lea         0x40(%3),%3                   \n"
+      "sub         $0x20,%4                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -9188,47 +9051,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const float* poly,
                             int width) {
-  asm volatile("pxor        %%xmm3,%%xmm3                 \n"
+      asm volatile("pxor        %%xmm3,%%xmm3                 \n"
 
                // 2 pixel loop.
                LABELALIGN
-               "1:          \n"
-               "movq        (%0),%%xmm0                   \n"
-               "lea         0x8(%0),%0                    \n"
-               "punpcklbw   %%xmm3,%%xmm0                 \n"
-               "movdqa      %%xmm0,%%xmm4                 \n"
-               "punpcklwd   %%xmm3,%%xmm0                 \n"
-               "punpckhwd   %%xmm3,%%xmm4                 \n"
-               "cvtdq2ps    %%xmm0,%%xmm0                 \n"
-               "cvtdq2ps    %%xmm4,%%xmm4                 \n"
-               "movdqa      %%xmm0,%%xmm1                 \n"
-               "movdqa      %%xmm4,%%xmm5                 \n"
-               "mulps       0x10(%3),%%xmm0               \n"
-               "mulps       0x10(%3),%%xmm4               \n"
-               "addps       (%3),%%xmm0                   \n"
-               "addps       (%3),%%xmm4                   \n"
-               "movdqa      %%xmm1,%%xmm2                 \n"
-               "movdqa      %%xmm5,%%xmm6                 \n"
-               "mulps       %%xmm1,%%xmm2                 \n"
-               "mulps       %%xmm5,%%xmm6                 \n"
-               "mulps       %%xmm2,%%xmm1                 \n"
-               "mulps       %%xmm6,%%xmm5                 \n"
-               "mulps       0x20(%3),%%xmm2               \n"
-               "mulps       0x20(%3),%%xmm6               \n"
-               "mulps       0x30(%3),%%xmm1               \n"
-               "mulps       0x30(%3),%%xmm5               \n"
-               "addps       %%xmm2,%%xmm0                 \n"
-               "addps       %%xmm6,%%xmm4                 \n"
-               "addps       %%xmm1,%%xmm0                 \n"
-               "addps       %%xmm5,%%xmm4                 \n"
-               "cvttps2dq   %%xmm0,%%xmm0                 \n"
-               "cvttps2dq   %%xmm4,%%xmm4                 \n"
-               "packuswb    %%xmm4,%%xmm0                 \n"
-               "packuswb    %%xmm0,%%xmm0                 \n"
-               "movq        %%xmm0,(%1)                   \n"
-               "lea         0x8(%1),%1                    \n"
-               "sub         $0x2,%2                       \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movq        (%0),%%xmm0                   \n"
+      "lea         0x8(%0),%0                    \n"
+      "punpcklbw   %%xmm3,%%xmm0                 \n"
+      "movdqa      %%xmm0,%%xmm4                 \n"
+      "punpcklwd   %%xmm3,%%xmm0                 \n"
+      "punpckhwd   %%xmm3,%%xmm4                 \n"
+      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
+      "cvtdq2ps    %%xmm4,%%xmm4                 \n"
+      "movdqa      %%xmm0,%%xmm1                 \n"
+      "movdqa      %%xmm4,%%xmm5                 \n"
+      "mulps       0x10(%3),%%xmm0               \n"
+      "mulps       0x10(%3),%%xmm4               \n"
+      "addps       (%3),%%xmm0                   \n"
+      "addps       (%3),%%xmm4                   \n"
+      "movdqa      %%xmm1,%%xmm2                 \n"
+      "movdqa      %%xmm5,%%xmm6                 \n"
+      "mulps       %%xmm1,%%xmm2                 \n"
+      "mulps       %%xmm5,%%xmm6                 \n"
+      "mulps       %%xmm2,%%xmm1                 \n"
+      "mulps       %%xmm6,%%xmm5                 \n"
+      "mulps       0x20(%3),%%xmm2               \n"
+      "mulps       0x20(%3),%%xmm6               \n"
+      "mulps       0x30(%3),%%xmm1               \n"
+      "mulps       0x30(%3),%%xmm5               \n"
+      "addps       %%xmm2,%%xmm0                 \n"
+      "addps       %%xmm6,%%xmm4                 \n"
+      "addps       %%xmm1,%%xmm0                 \n"
+      "addps       %%xmm5,%%xmm4                 \n"
+      "cvttps2dq   %%xmm0,%%xmm0                 \n"
+      "cvttps2dq   %%xmm4,%%xmm4                 \n"
+      "packuswb    %%xmm4,%%xmm0                 \n"
+      "packuswb    %%xmm0,%%xmm0                 \n"
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x2,%2                       \n"
+      "jg          1b                            \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -9316,7 +9179,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)  // %3
+      : "m"(scale)    // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
 }
@@ -9354,7 +9217,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)  // %3
+      : "m"(scale)    // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4");
 }
@@ -9688,20 +9551,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
 
 // Convert UV plane of NV12 to VU of NV21.
 void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile("movdqu      %3,%%xmm5                     \n"
+      asm volatile("movdqu      %3,%%xmm5                     \n"
 
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm0                   \n"
-               "movdqu      0x10(%0),%%xmm1               \n"
-               "lea         0x20(%0),%0                   \n"
-               "pshufb      %%xmm5,%%xmm0                 \n"
-               "pshufb      %%xmm5,%%xmm1                 \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "movdqu      %%xmm1,0x10(%1)               \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"
+      "movdqu      0x10(%0),%%xmm1               \n"
+      "lea         0x20(%0),%0                   \n"
+      "pshufb      %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm5,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
                : "+r"(src_uv),        // %0
                  "+r"(dst_vu),        // %1
                  "+r"(width)          // %2
@@ -9712,21 +9575,21 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
 
 #ifdef HAS_SWAPUVROW_AVX2
 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
+      asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
 
                LABELALIGN
-               "1:          \n"
-               "vmovdqu     (%0),%%ymm0                   \n"
-               "vmovdqu     0x20(%0),%%ymm1               \n"
-               "lea         0x40(%0),%0                   \n"
-               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-               "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "vmovdqu     %%ymm1,0x20(%1)               \n"
-               "lea         0x40(%1),%1                   \n"
-               "sub         $0x20,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vmovdqu     0x20(%0),%%ymm1               \n"
+      "lea         0x40(%0),%0                   \n"
+      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_uv),        // %0
                  "+r"(dst_vu),        // %1
                  "+r"(width)          // %2
diff --git a/source/row_lasx.cc b/source/row_lasx.cc
index e0802c15e..94cb44ed1 100644
--- a/source/row_lasx.cc
+++ b/source/row_lasx.cc
@@ -2027,12 +2027,10 @@ struct ArgbConstants {
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                         128,
-                                                         0};
+                                                        128,
+                                                        0};
 
-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
-                                                       128,
-                                                       0};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
 
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@@ -2041,19 +2039,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
 // Add 16.5 = 0x1080
 
 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                         0x1080,
-                                                         0};
+                                                        0x1080,
+                                                        0};
 
 static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                       0x1080,
-                                                       0};
+                                                      0x1080,
+                                                      0};
 #endif  // ArgbConstants
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
-                           uint8_t* dst_y,
-                           int width,
-                           const struct ArgbConstants* c) {
+                                  uint8_t* dst_y,
+                                  int width,
+                                  const struct ArgbConstants* c) {
   int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
   asm volatile(
       "xvldrepl.b      $xr0,  %3,    0             \n\t"  // load rgbconstants
@@ -2218,14 +2216,18 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
       "xvst            $xr10, %1,    0             \n\t"
       "addi.d          %1,    %1,    32            \n\t"
       "bnez            %2,    1b                   \n\t"
-      : "+&r"(src_rgba),  // %0
-        "+&r"(dst_y),     // %1
-        "+&r"(width)      // %2
-      : "r"(c),           // %3
-        "r"(shuff)        // %4
+      : "+&r"(src_rgba),    // %0
+        "+&r"(dst_y),       // %1
+        "+&r"(width)        // %2
+      : "r"(c),  // %3
+        "r"(shuff)          // %4
       : "memory");
 }
 
+
+
+
+
 void ARGBToUVJRow_LASX(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
diff --git a/source/row_lsx.cc b/source/row_lsx.cc
index 3e6d5154c..41689578a 100644
--- a/source/row_lsx.cc
+++ b/source/row_lsx.cc
@@ -2812,12 +2812,10 @@ struct ArgbConstants {
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                         128,
-                                                         0};
+                                                        128,
+                                                        0};
 
-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
-                                                       128,
-                                                       0};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
 
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@@ -2826,19 +2824,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
 // Add 16.5 = 0x1080
 
 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                         0x1080,
-                                                         0};
+                                                        0x1080,
+                                                        0};
 
 static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                       0x1080,
-                                                       0};
+                                                      0x1080,
+                                                      0};
 #endif  // ArgbConstants
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c) {
   asm volatile(
       "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
       "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
@@ -2989,14 +2987,18 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
       "vst            $vr10, %1,    0             \n\t"
       "addi.d         %1,    %1,    16            \n\t"
       "bnez           %2,    1b                   \n\t"
-      : "+&r"(src_rgba),  // %0
-        "+&r"(dst_y),     // %1
-        "+&r"(width)      // %2
-      : "r"(c),           // %3
-        "r"(shuff)        // %4
+      : "+&r"(src_rgba),    // %0
+        "+&r"(dst_y),       // %1
+        "+&r"(width)        // %2
+      : "r"(c),  // %3
+        "r"(shuff)          // %4
       : "memory");
 }
 
+
+
+
+
 // undef for unified sources build
 #undef YUVTORGB_SETUP
 #undef READYUV422_D
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 08608005f..257398bbe 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -272,7 +272,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
       "subs        %[width], %[width], #8        \n"  //
       YUVTORGB                                        //
           RGBTORGB8                                   //
-      STORERGBA                                       //
+              STORERGBA                               //
       "bgt         1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
         [src_u] "+r"(src_u),                               // %[src_u]
@@ -325,8 +325,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
       YUVTORGB_SETUP
       "vmov.u8     d6, #255                      \n"
       "1:          \n"  //
-      READYUV422 "subs        %[width], %[width], #8        \n" YUVTORGB
-          RGBTORGB8 ARGBTORGB565
+      READYUV422
+      "subs        %[width], %[width], #8        \n" YUVTORGB RGBTORGB8
+          ARGBTORGB565
       "vst1.8      {q2}, [%[dst_rgb565]]!        \n"  // store 8 pixels RGB565.
       "bgt         1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
@@ -1847,54 +1848,45 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                                int width,
                                const struct ArgbConstants* c) {
   asm volatile(
-      "vld1.8      {d24}, [%4]                   \n"  // load kRGBToU
-      "vld1.8      {d25}, [%5]                   \n"  // load kRGBToV
-      "vld1.16     {d26[0]}, [%6]                \n"  // load kAddUV[0]
-      "vmovl.s8    q10, d24                      \n"  // U coeffs (8 shorts)
-      "vmovl.s8    q11, d25                      \n"  // V coeffs (8 shorts)
-      "vdup.16     q6, d26[0]                    \n"  // bias
+      "vld1.8      {d16}, [%4]                   \n"  // load kRGBToU
+      "vld1.8      {d17}, [%5]                   \n"  // load kRGBToV
+      "vld1.16     {d18[0]}, [%6]                \n"  // load kAddUV[0]
+      "vabs.s8     d16, d16                      \n"  // BU, GU, RU
+      "vabs.s8     d17, d17                      \n"  // BV, GV, RV
+      "vdup.8      d20, d16[0]                   \n"  // BU
+      "vdup.8      d21, d16[1]                   \n"  // GU
+      "vdup.8      d22, d16[2]                   \n"  // RU
+      "vdup.8      d23, d17[0]                   \n"  // BV
+      "vdup.8      d24, d17[1]                   \n"  // GV
+      "vdup.8      d25, d17[2]                   \n"  // RV
+      "vdup.16     q15, d18[0]                   \n"  // kAddUV
+
       "1:          \n"
       "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
       "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d20                   \n"  // B * BU
+      "vmlsl.u8    q2, d1, d21                   \n"  // - G * GU
+      "vmlsl.u8    q2, d2, d22                   \n"  // - R * RU
 
-      "vmovl.u8    q4, d0                        \n"  // B
-      "vmovl.u8    q5, d1                        \n"  // G
-      "vmovl.u8    q7, d2                        \n"  // R
-      "vmovl.u8    q8, d3                        \n"  // A
+      "vmull.u8    q3, d2, d25                   \n"  // R * RV
+      "vmlsl.u8    q3, d1, d24                   \n"  // - G * GV
+      "vmlsl.u8    q3, d0, d23                   \n"  // - B * BV
 
-      "vdup.16     q12, d20[0]                   \n"
-      "vmul.s16    q2, q4, q12                   \n"  // U = B * U0
-      "vdup.16     q12, d20[1]                   \n"
-      "vmla.s16    q2, q5, q12                   \n"  // U += G * U1
-      "vdup.16     q12, d20[2]                   \n"
-      "vmla.s16    q2, q7, q12                   \n"  // U += R * U2
-      "vdup.16     q12, d20[3]                   \n"
-      "vmla.s16    q2, q8, q12                   \n"  // U += A * U3
-
-      "vdup.16     q12, d22[0]                   \n"
-      "vmul.s16    q3, q4, q12                   \n"  // V = B * V0
-      "vdup.16     q12, d22[1]                   \n"
-      "vmla.s16    q3, q5, q12                   \n"  // V += G * V1
-      "vdup.16     q12, d22[2]                   \n"
-      "vmla.s16    q3, q7, q12                   \n"  // V += R * V2
-      "vdup.16     q12, d22[3]                   \n"
-      "vmla.s16    q3, q8, q12                   \n"  // V += A * V3
-
-      "vsubhn.s16  d0, q6, q2                    \n"  // 128.0 - U
-      "vsubhn.s16  d1, q6, q3                    \n"  // 128.0 - V
+      "vaddhn.u16  d0, q2, q15                   \n"  // signed -> unsigned
+      "vaddhn.u16  d1, q3, q15                   \n"
 
       "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
       "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
       "bgt         1b                            \n"
-      : "+r"(src_argb),    // %0
-        "+r"(dst_u),       // %1
-        "+r"(dst_v),       // %2
-        "+r"(width)        // %3
-      : "r"(&c->kRGBToU),  // %4
-        "r"(&c->kRGBToV),  // %5
-        "r"(&c->kAddUV)    // %6
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
-        "q10", "q11", "q12");
+      : "+r"(src_argb),     // %0
+        "+r"(dst_u),        // %1
+        "+r"(dst_v),        // %2
+        "+r"(width)         // %3
+      : "r"(&c->kRGBToU),   // %4
+        "r"(&c->kRGBToV),   // %5
+        "r"(&c->kAddUV)     // %6
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
@@ -1911,6 +1903,7 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
   ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
 }
 
+
 // clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR)                                                 \
@@ -1932,68 +1925,61 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                             int width,
                             const struct ArgbConstants* c) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile(
-      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes,
-                                                      // only 4 used)
-      "vld1.8      {d25}, [%6]                   \n"  // load kRGBToV
-      "vmovl.s8    q14, d24                      \n"  // U coeffs in d28
-      "vmovl.s8    q15, d25                      \n"  // V coeffs in d30
-      "vmov.u16    q11, #0x8000                  \n"  // 128.0 bias
+  asm volatile (
+      "vld1.8      {d18}, [%5]                   \n"  // load kRGBToU
+      "vld1.8      {d19}, [%6]                   \n"  // load kRGBToV
+      "vmovl.s8    q8, d18                       \n"  // U coeffs in q8 (d16, d17)
+      "vmovl.s8    q9, d19                       \n"  // V coeffs in q9 (d18, d19)
+      "vdup.16     q10, d16[0]                   \n"  // U0
+      "vdup.16     q11, d16[1]                   \n"  // U1
+      "vdup.16     q12, d16[2]                   \n"  // U2
+      "vdup.16     q13, d18[0]                   \n"  // V0
+      "vdup.16     q14, d18[1]                   \n"  // V1
+      "vdup.16     q15, d18[2]                   \n"  // V2
 
       "1:          \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
-                                                      // pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels.
       "subs        %4, %4, #16                   \n"  // 16 processed per loop.
       "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
       "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
       "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
-      "vpaddl.u8   q3, q3                        \n"  // A 16 bytes -> 8 shorts.
-      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more pixels.
-      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"
-      "vpadal.u8   q0, q4                        \n"  // B
-      "vpadal.u8   q1, q5                        \n"  // G
-      "vpadal.u8   q2, q6                        \n"  // R
-      "vpadal.u8   q3, q7                        \n"  // A
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ARGB pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ARGB pixels.
+      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
 
       "vrshr.u16   q0, q0, #2                    \n"  // average of 4
       "vrshr.u16   q1, q1, #2                    \n"
       "vrshr.u16   q2, q2, #2                    \n"
-      "vrshr.u16   q3, q3, #2                    \n"
 
-      "vdup.16     q12, d28[0]                   \n"
-      "vmul.s16    q8, q0, q12                   \n"  // U = B * U0
-      "vdup.16     q12, d28[1]                   \n"
-      "vmla.s16    q8, q1, q12                   \n"  // U += G * U1
-      "vdup.16     q12, d28[2]                   \n"
+      "vmov.u16    q3, #0x8000                   \n"  // 128.0
+
+      "vmul.s16    q8, q0, q10                   \n"  // U = B * U0
+      "vmla.s16    q8, q1, q11                   \n"  // U += G * U1
       "vmla.s16    q8, q2, q12                   \n"  // U += R * U2
-      "vdup.16     q12, d28[3]                   \n"
-      "vmla.s16    q8, q3, q12                   \n"  // U += A * U3
 
-      "vdup.16     q12, d30[0]                   \n"
-      "vmul.s16    q9, q0, q12                   \n"  // V = B * V0
-      "vdup.16     q12, d30[1]                   \n"
-      "vmla.s16    q9, q1, q12                   \n"  // V += G * V1
-      "vdup.16     q12, d30[2]                   \n"
-      "vmla.s16    q9, q2, q12                   \n"  // V += R * V2
-      "vdup.16     q12, d30[3]                   \n"
-      "vmla.s16    q9, q3, q12                   \n"  // V += A * V3
+      "vmul.s16    q9, q0, q13                   \n"  // V = B * V0
+      "vmla.s16    q9, q1, q14                   \n"  // V += G * V1
+      "vmla.s16    q9, q2, q15                   \n"  // V += R * V2
 
-      "vsubhn.s16  d0, q11, q8                   \n"  // 128.0 - U
-      "vsubhn.s16  d1, q11, q9                   \n"  // 128.0 - V
+      "vsubhn.s16  d0, q3, q8                    \n"  // 128.0 - U
+      "vsubhn.s16  d1, q3, q9                    \n"  // 128.0 - V
 
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
       "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
       "bgt         1b                            \n"
-      : "+r"(src_argb),    // %0
-        "+r"(src_argb_1),  // %1
-        "+r"(dst_u),       // %2
-        "+r"(dst_v),       // %3
-        "+r"(width)        // %4
-      : "r"(&c->kRGBToU),  // %5
-        "r"(&c->kRGBToV)   // %6
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
-        "q9", "q11", "q12", "q14", "q15");
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  : "r"(&c->kRGBToU),  // %5
+    "r"(&c->kRGBToV)   // %6
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }
 
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
@@ -2226,8 +2212,44 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-                         &kBgraI601Constants);
+  asm volatile (
+      "add         %1, %0, %1                    \n"  // src_stride + src_bgra
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8000                  \n"  // 128.0
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 BGRA pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 BGRA pixels.
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vpaddl.u8   q3, q3                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q2                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more BGRA pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 BGRA pixels.
+      "vpadal.u8   q3, q7                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q1, q1, #2                    \n"  // average of 4
+      "vrshr.u16   q2, q2, #2                    \n"
+      "vrshr.u16   q3, q3, #2                    \n"
+
+    RGBTOUV(q3, q2, q1)
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1
+    "+r"(dst_u),     // %2-
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }
 
 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
@@ -2235,8 +2257,44 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                         &kAbgrI601Constants);
+  asm volatile (
+      "add         %1, %0, %1                    \n"  // src_stride + src_abgr
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8000                  \n"  // 128.0
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ABGR pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ABGR pixels.
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vpaddl.u8   q2, q2                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q0, q0                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ABGR pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ABGR pixels.
+      "vpadal.u8   q2, q6                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
+
+    RGBTOUV(q2, q1, q0)
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_stride_abgr),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }
 
 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
@@ -2244,8 +2302,44 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
-                         &kRgbaI601Constants);
+  asm volatile (
+      "add         %1, %0, %1                    \n"  // src_stride + src_rgba
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8000                  \n"  // 128.0
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 RGBA pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 RGBA pixels.
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vpaddl.u8   q0, q1                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q2                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q3                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more RGBA pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 RGBA pixels.
+      "vpadal.u8   q0, q5                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q6                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q7                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
+
+    RGBTOUV(q0, q1, q2)
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(src_stride_rgba),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }
 
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
@@ -2703,20 +2797,19 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                           uint8_t* dst_y,
-                           int width,
-                           const struct ArgbConstants* c) {
+                            uint8_t* dst_y,
+                            int width,
+                            const struct ArgbConstants* c) {
   asm volatile(
-      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d24[0]                   \n"  // B
-      "vdup.8      d21, d24[1]                   \n"  // G
-      "vdup.8      d22, d24[2]                   \n"  // R
-      "vdup.8      d23, d24[3]                   \n"  // A
-      "vdup.16     q12, d25[0]                   \n"  // bias
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // AY
       "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of ARGB
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 pixels
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of ARGB
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
       "subs        %1, %1, #16                   \n"  // 16 processed per loop.
       "vmull.u8    q8, d0, d20                   \n"  // B
       "vmull.u8    q9, d1, d20                   \n"
@@ -2724,8 +2817,6 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
       "vmlal.u8    q9, d3, d21                   \n"
       "vmlal.u8    q8, d4, d22                   \n"  // R
       "vmlal.u8    q9, d5, d22                   \n"
-      "vmlal.u8    q8, d6, d23                   \n"  // A
-      "vmlal.u8    q9, d7, d23                   \n"
       "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
       "vaddhn.u16  d1, q9, q12                   \n"
       "vst1.8      {d0, d1}, [%2]!               \n"  // store 16 pixels Y.
@@ -2735,8 +2826,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
         "+r"(dst_y)        // %2
       : "r"(&c->kRGBToY),  // %3
         "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
-        "d24", "d25");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+        "q12");
 }
 
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -2755,33 +2846,65 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
   ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
 }
 
+// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
+// Same code as ARGB, except the LD4
+static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+                                  uint8_t* dst_y,
+                                  int width,
+                                  const struct ArgbConstants* c) {
+  asm volatile(
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // AY
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of RGBA
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
+      "vmull.u8    q8, d2, d20                   \n"  // B
+      "vmull.u8    q9, d3, d20                   \n"
+      "vmlal.u8    q8, d4, d21                   \n"  // G
+      "vmlal.u8    q9, d5, d21                   \n"
+      "vmlal.u8    q8, d6, d22                   \n"  // R
+      "vmlal.u8    q9, d7, d22                   \n"
+      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
+      "vaddhn.u16  d1, q9, q12                   \n"
+      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
+      "bgt         1b                            \n"
+      : "+r"(src_rgba),    // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      : "r"(&c->kRGBToY),  // %3
+        "r"(&c->kAddY)     // %4
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+        "q12");
+}
+
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
 }
 
 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
 }
 
 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
-}
-
-void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_bgra, dst_yj, width, &kBgraJPEGConstants);
+  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
 }
 
 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c) {
   asm volatile(
-      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d24[0]                   \n"  // BY
-      "vdup.8      d21, d24[1]                   \n"  // GY
-      "vdup.8      d22, d24[2]                   \n"  // RY
-      "vdup.16     q12, d25[0]                   \n"  // AY
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // AY
       "1:          \n"
       "vld3.8      {d2, d4, d6}, [%0]!           \n"  // load 16 pixels of
                                                       // RGB24.
@@ -2802,10 +2925,14 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
         "+r"(width)        // %2
       : "r"(&c->kRGBToY),  // %3
         "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
-        "d24", "d25");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+        "q12");
 }
 
+
+
+
+
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index f90b4a18b..19016cc3b 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/convert_from_argb.h"
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -292,12 +292,12 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
   uint16_t limit = 0x3ff0;
   uint16_t alpha = 0xc000;
   asm volatile(YUVTORGB_SETUP
-               "dup         v22.8h, %w[limit]             \n"
-               "dup         v23.8h, %w[alpha]             \n"
-               "1:          \n"  //
+      "dup         v22.8h, %w[limit]             \n"
+      "dup         v23.8h, %w[alpha]             \n"
+      "1:          \n"  //
                READYUV210
-               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-               "b.gt        1b                            \n"
+      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+      "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),             // %[src_y]
                  [src_u] "+r"(src_u),             // %[src_u]
                  [src_v] "+r"(src_v),             // %[src_v]
@@ -321,12 +321,12 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
   uint16_t limit = 0x3ff0;
   uint16_t alpha = 0xc000;
   asm volatile(YUVTORGB_SETUP
-               "dup         v22.8h, %w[limit]             \n"
-               "dup         v23.8h, %w[alpha]             \n"
-               "1:          \n"  //
+      "dup         v22.8h, %w[limit]             \n"
+      "dup         v23.8h, %w[alpha]             \n"
+      "1:          \n"  //
                READYUV410
-               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-               "b.gt        1b                            \n"
+      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+      "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),             // %[src_y]
                  [src_u] "+r"(src_u),             // %[src_u]
                  [src_v] "+r"(src_v),             // %[src_v]
@@ -349,12 +349,12 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   const uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-               "dup         v22.8h, %w[limit]             \n"
-               "movi        v23.8h, #0xc0, lsl #8         \n"  // A
-               "1:          \n"                                //
+      "dup         v22.8h, %w[limit]             \n"
+      "movi        v23.8h, #0xc0, lsl #8         \n"  // A
+      "1:          \n"                                //
                READYUV212
-               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-               "b.gt        1b                            \n"
+      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+      "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),             // %[src_y]
                  [src_u] "+r"(src_u),             // %[src_u]
                  [src_v] "+r"(src_v),             // %[src_v]
@@ -531,13 +531,13 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   const uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-               "dup         v22.8h, %w[limit]             \n"
-               "movi        v23.8h, #0xc0, lsl #8         \n"  // A
-               "ldr         q2, [%[kIndices]]             \n"
-               "1:          \n"  //
+      "dup         v22.8h, %w[limit]             \n"
+      "movi        v23.8h, #0xc0, lsl #8         \n"  // A
+      "ldr         q2, [%[kIndices]]             \n"
+      "1:          \n"  //
                READYUVP210
-               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-               "b.gt        1b                            \n"
+      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+      "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),                     // %[src_y]
                  [src_uv] "+r"(src_uv),                   // %[src_uv]
                  [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
@@ -558,13 +558,13 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-               "dup         v22.8h, %w[limit]             \n"
-               "movi        v23.8h, #0xc0, lsl #8         \n"  // A
-               "ldr         q2, [%[kIndices]]             \n"
-               "1:          \n"  //
+      "dup         v22.8h, %w[limit]             \n"
+      "movi        v23.8h, #0xc0, lsl #8         \n"  // A
+      "ldr         q2, [%[kIndices]]             \n"
+      "1:          \n"  //
                READYUVP410
-               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-               "b.gt        1b                            \n"
+      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+      "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),                     // %[src_y]
                  [src_uv] "+r"(src_uv),                   // %[src_uv]
                  [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
@@ -783,8 +783,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
   asm volatile(
       YUVTORGB_SETUP
       "1:          \n"  //
-      READYUV422 "subs        %w[width], %w[width], #8      \n" I4XXTORGB
-          RGBTORGB8_TOP ARGBTORGB565_FROM_TOP
+      READYUV422
+      "subs        %w[width], %w[width], #8      \n" I4XXTORGB RGBTORGB8_TOP
+          ARGBTORGB565_FROM_TOP
       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
       "b.gt        1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
@@ -1035,8 +1036,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
       YUVTORGB_SETUP
       "ldr         q2, [%[kNV12Table]]           \n"
       "1:          \n"  //
-      READNV12 "subs        %w[width], %w[width], #8      \n" NVTORGB
-          RGBTORGB8_TOP ARGBTORGB565_FROM_TOP
+      READNV12
+      "subs        %w[width], %w[width], #8      \n" NVTORGB RGBTORGB8_TOP
+          ARGBTORGB565_FROM_TOP
       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8
                                                        // pixels
                                                        // RGB565.
@@ -2734,75 +2736,58 @@ struct RgbUVConstants {
 };
 
 // 8x1 pixels.
-void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c) {
+static void ARGBToUV444MatrixRow_NEON(
+    const uint8_t* src_argb,
+    uint8_t* dst_u,
+    uint8_t* dst_v,
+    int width,
+    const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
-      "ldr         q16, [%[c], #16]               \n"  // kRGBToU
-      "ldr         q17, [%[c], #32]               \n"  // kRGBToV
-      "ldr         s0, [%[c], #64]                \n"  // kAddUV
-      "sxtl        v16.8h, v16.8b                 \n"  // sign extend U coeffs
-                                                       // to 16-bit
-      "sxtl        v17.8h, v17.8b                 \n"  // sign extend V coeffs
-                                                       // to 16-bit
-      "dup         v20.8h, v16.h[0]               \n"  // U0
-      "dup         v21.8h, v16.h[1]               \n"  // U1
-      "dup         v22.8h, v16.h[2]               \n"  // U2
-      "dup         v23.8h, v16.h[3]               \n"  // U3
-      "dup         v24.8h, v17.h[0]               \n"  // V0
-      "dup         v26.8h, v17.h[1]               \n"  // V1
-      "dup         v27.8h, v17.h[2]               \n"  // V2
-      "dup         v28.8h, v17.h[3]               \n"  // V3
-      "dup         v25.8h, v0.h[0]                \n"  // kAddUV
+      "ldr         d0, [%4]                      \n"  // load rgbuvconstants
+      "dup         v24.16b, v0.b[0]              \n"  // UB  0.875 coefficient
+      "dup         v25.16b, v0.b[1]              \n"  // UG -0.5781 coefficient
+      "dup         v26.16b, v0.b[2]              \n"  // UR -0.2969 coefficient
+      "dup         v27.16b, v0.b[4]              \n"  // VB -0.1406 coefficient
+      "dup         v28.16b, v0.b[5]              \n"  // VG -0.7344 coefficient
+      "neg         v24.16b, v24.16b              \n"
+      "movi        v29.8h, #0x80, lsl #8         \n"  // 128.0
+
       "1:          \n"
       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "umull       v4.8h, v0.8b, v24.8b          \n"  // B
+      "umlsl       v4.8h, v1.8b, v25.8b          \n"  // G
+      "umlsl       v4.8h, v2.8b, v26.8b          \n"  // R
+      "prfm        pldl1keep, [%0, 448]          \n"
 
-      "uxtl        v4.8h, v0.8b                  \n"
-      "uxtl        v5.8h, v1.8b                  \n"
-      "uxtl        v6.8h, v2.8b                  \n"
-      "uxtl        v7.8h, v3.8b                  \n"
+      "umull       v3.8h, v2.8b, v24.8b          \n"  // R
+      "umlsl       v3.8h, v1.8b, v28.8b          \n"  // G
+      "umlsl       v3.8h, v0.8b, v27.8b          \n"  // B
 
-      // U = B*U0 + G*U1 + R*U2 + A*U3
-      "mul         v18.8h, v4.8h, v20.8h         \n"
-      "mla         v18.8h, v5.8h, v21.8h         \n"
-      "mla         v18.8h, v6.8h, v22.8h         \n"
-      "mla         v18.8h, v7.8h, v23.8h         \n"
+      "addhn       v0.8b, v4.8h, v29.8h          \n"  // signed -> unsigned
+      "addhn       v1.8b, v3.8h, v29.8h          \n"
 
-      // V = B*V0 + G*V1 + R*V2 + A*V3
-      "mul         v19.8h, v4.8h, v24.8h         \n"
-      "mla         v19.8h, v5.8h, v26.8h         \n"
-      "mla         v19.8h, v6.8h, v27.8h         \n"
-      "mla         v19.8h, v7.8h, v28.8h         \n"
-
-      "subhn       v0.8b, v25.8h, v18.8h         \n"
-      "subhn       v1.8b, v25.8h, v19.8h         \n"
-
-      "st1         {v0.8b}, [%1], #8             \n"
-      "st1         {v1.8b}, [%2], #8             \n"
+      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_u),     // %1
-        "+r"(dst_v),     // %2
-        "+r"(width)      // %3
-      : [c] "r"(c)       // %4
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-        "v27", "v28");
+      : "+r"(src_argb),      // %0
+        "+r"(dst_u),         // %1
+        "+r"(dst_v),         // %2
+        "+r"(width)          // %3
+      : "r"(rgbuvconstants)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+        "v27", "v28", "v29");
 }
 
-static void ARGBToUV444MatrixRow_NEON_I8MM(const uint8_t* src_argb,
-                                           uint8_t* dst_u,
-                                           uint8_t* dst_v,
-                                           int width,
-                                           const struct ArgbConstants* c) {
+static void ARGBToUV444MatrixRow_NEON_I8MM(
+    const uint8_t* src_argb,
+    uint8_t* dst_u,
+    uint8_t* dst_v,
+    int width,
+    const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
-      "ldr         q16, [%[c], #16]              \n"  // kRGBToU
-      "ldr         q17, [%[c], #32]              \n"  // kRGBToV
-      "ldr         s0, [%[c], #64]               \n"  // kAddUV
-      "dup         v29.8h, v0.h[0]               \n"  // 128.0
+      "ld2r        {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
+      "movi        v29.8h, #0x80, lsl #8         \n"  // 128.0
       "1:          \n"
       "ldp         q0, q1, [%[src]], #32         \n"
       "subs        %w[width], %w[width], #8      \n"  // 8 processed per loop.
@@ -2822,11 +2807,11 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(const uint8_t* src_argb,
       "str         d0, [%[dst_u]], #8            \n"  // store 8 pixels U.
       "str         d1, [%[dst_v]], #8            \n"  // store 8 pixels V.
       "b.gt        1b                            \n"
-      : [src] "+r"(src_argb),  // %[src]
-        [dst_u] "+r"(dst_u),   // %[dst_u]
-        [dst_v] "+r"(dst_v),   // %[dst_v]
-        [width] "+r"(width)    // %[width]
-      : [c] "r"(c)             // %[c]
+      : [src] "+r"(src_argb),                 // %[src]
+        [dst_u] "+r"(dst_u),                  // %[dst_u]
+        [dst_v] "+r"(dst_v),                  // %[dst_v]
+        [width] "+r"(width)                   // %[width]
+      : [rgbuvconstants] "r"(rgbuvconstants)  // %[rgbuvconstants]
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
         "v29");
 }
@@ -2839,11 +2824,15 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(const uint8_t* src_argb,
 // VG -0.7344 coefficient = -94
 // VR   0.875 coefficient = 112
 
+static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
+                                                           {18, 94, -112, 0}};
+
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbI601Constants);
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kARGBI601UVConstants);
 }
 
 void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@@ -2851,14 +2840,26 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
                               uint8_t* dst_v,
                               int width) {
   ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
-                                 &kArgbI601Constants);
+                                 &kARGBI601UVConstants);
 }
 
+// RGB to JPEG coefficients
+// UB  0.500    coefficient = 128
+// UG -0.33126  coefficient = -85
+// UR -0.16874  coefficient = -43
+// VB -0.08131  coefficient = -21
+// VG -0.41869  coefficient = -107
+// VR 0.500     coefficient = 128
+
+static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
+                                                           {21, 107, -128, 0}};
+
 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kARGBJPEGUVConstants);
 }
 
 void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
@@ -2866,7 +2867,7 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
                                uint8_t* dst_v,
                                int width) {
   ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
-                                 &kArgbJPEGConstants);
+                                 &kARGBJPEGUVConstants);
 }
 
 #define RGBTOUV_SETUP_REG                                                  \
@@ -2900,75 +2901,63 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                             int width,
                             const struct ArgbConstants* c) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile(
-      "ldr         q16, [%[c], #16]               \n"  // kRGBToU
-      "ldr         q17, [%[c], #32]               \n"  // kRGBToV
-      "sxtl        v16.8h, v16.8b                 \n"  // sign extend U coeffs
-                                                       // to 16-bit
-      "sxtl        v17.8h, v17.8b                 \n"  // sign extend V coeffs
-                                                       // to 16-bit
-      "dup         v20.8h, v16.h[0]               \n"  // U0
-      "dup         v21.8h, v16.h[1]               \n"  // U1
-      "dup         v22.8h, v16.h[2]               \n"  // U2
-      "dup         v23.8h, v16.h[3]               \n"  // U3
-      "dup         v24.8h, v17.h[0]               \n"  // V0
-      "dup         v26.8h, v17.h[1]               \n"  // V1
-      "dup         v27.8h, v17.h[2]               \n"  // V2
-      "dup         v28.8h, v17.h[3]               \n"  // V3
-      "movi        v25.8h, #0x80, lsl #8          \n"  // 128.0 in 16-bit
-                                                       // (0x8000)
+  asm volatile (
+      "ldr        q16, [%[c], #16]               \n" // kRGBToU
+      "ldr        q17, [%[c], #32]               \n" // kRGBToV
+      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
+      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
+      "dup        v20.8h, v16.h[0]               \n" // U0 (-BU)
+      "dup        v21.8h, v16.h[1]               \n" // U1 (-GU)
+      "dup        v22.8h, v16.h[2]               \n" // U2 (-RU)
+      "dup        v23.8h, v17.h[0]               \n" // V0 (-BV)
+      "dup        v24.8h, v17.h[1]               \n" // V1 (-GV)
+      "dup        v26.8h, v17.h[2]               \n" // V2 (-RV)
+      "movi       v25.8h, #0x80, lsl #8          \n" // 128.0 in 16-bit (0x8000)
 
       "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
-                                                                 // pixels.
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
       "prfm        pldl1keep, [%0, 448]          \n"
       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "uaddlp      v18.8h, v3.16b                \n"  // A 16 bytes -> 8 shorts.
 
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16
-                                                                 // more.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
       "prfm        pldl1keep, [%1, 448]          \n"
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "uadalp      v18.8h, v7.16b                \n"  // A 16 bytes -> 8 shorts.
 
       "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
       "urshr       v1.8h, v1.8h, #2              \n"
       "urshr       v2.8h, v2.8h, #2              \n"
-      "urshr       v18.8h, v18.8h, #2             \n"
 
-      // U = B*U0 + G*U1 + R*U2 + A*U3
-      "mul         v3.8h, v0.8h, v20.8h          \n"
-      "mla         v3.8h, v1.8h, v21.8h          \n"
-      "mla         v3.8h, v2.8h, v22.8h          \n"
-      "mla         v3.8h, v18.8h, v23.8h         \n"
+      // U = B*U0 + G*U1 + R*U2
+      "mul        v3.8h, v0.8h, v20.8h          \n"
+      "mla        v3.8h, v1.8h, v21.8h          \n"
+      "mla        v3.8h, v2.8h, v22.8h          \n"
 
-      // V = B*V0 + G*V1 + R*V2 + A*V3
-      "mul         v4.8h, v0.8h, v24.8h          \n"
-      "mla         v4.8h, v1.8h, v26.8h          \n"
-      "mla         v4.8h, v2.8h, v27.8h          \n"
-      "mla         v4.8h, v18.8h, v28.8h         \n"
+      // V = B*V0 + G*V1 + R*V2
+      "mul        v4.8h, v0.8h, v23.8h          \n"
+      "mla        v4.8h, v1.8h, v24.8h          \n"
+      "mla        v4.8h, v2.8h, v26.8h          \n"
 
       // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
-      "subhn       v0.8b, v25.8h, v3.8h           \n"
-      "subhn       v1.8b, v25.8h, v4.8h           \n"
+      "subhn      v0.8b, v25.8h, v3.8h           \n"
+      "subhn      v1.8b, v25.8h, v4.8h           \n"
 
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),    // %0
-        "+r"(src_argb_1),  // %1
-        "+r"(dst_u),       // %2
-        "+r"(dst_v),       // %3
-        "+r"(width)        // %4
-      : [c] "r"(c)         // %5
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
-        "v28");
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  : [c] "r"(c)         // %5
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
+  );
 }
 
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
@@ -2985,35 +2974,44 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
-  ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
-                         &kArgbJPEGConstants);
-}
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+  asm volatile (
+      "movi        v20.8h, #128                  \n"  // UB/VR coeff (0.500)
+      "movi        v21.8h, #85                   \n"  // UG coeff (-0.33126)
+      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
+      "movi        v23.8h, #21                   \n"  // VB coeff (-0.08131)
+      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
+      "movi        v25.8h, #0x80, lsl #8         \n"  // 128.0 (0x8000 in 16-bit)
+      "1:          \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-void ABGRToUVRow_NEON(const uint8_t* src_abgr,
-                      int src_stride_abgr,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                         &kAbgrI601Constants);
-}
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
-void BGRAToUVRow_NEON(const uint8_t* src_bgra,
-                      int src_stride_bgra,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-                         &kBgraI601Constants);
-}
-
-void RGBAToUVRow_NEON(const uint8_t* src_rgba,
-                      int src_stride_rgba,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
-                         &kRgbaI601Constants);
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
 }
 
 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
@@ -3021,8 +3019,44 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                        uint8_t* dst_uj,
                        uint8_t* dst_vj,
                        int width) {
-  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
-                         &kAbgrJPEGConstants);
+  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
+  asm volatile (
+      "movi        v20.8h, #128                  \n"  // UB/VR coeff (0.500)
+      "movi        v21.8h, #85                   \n"  // UG coeff (-0.33126)
+      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
+      "movi        v23.8h, #21                   \n"  // VB coeff (-0.08131)
+      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
+      "movi        v25.8h, #0x80, lsl #8         \n"  // 128.0 (0x8000 in 16-bit)
+      "1:          \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
+
+    RGBTOUV(v2.8h, v1.8h, v0.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_abgr_1),  // %1
+    "+r"(dst_uj),     // %2
+    "+r"(dst_vj),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
 }
 
 void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
@@ -3115,6 +3149,126 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
   );
 }
 
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+      "1:          \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      "uaddlp      v0.8h, v3.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v3.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v1.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
+      "uadalp      v0.8h, v7.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v3.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v5.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v3.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
+
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_bgra_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+      "1:          \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      "uaddlp      v3.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v2.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+      "uadalp      v3.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v2.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v3.8h, #2              \n"  // average of 4
+      "urshr       v2.8h, v2.8h, #2              \n"
+      "urshr       v1.8h, v1.8h, #2              \n"
+
+    RGBTOUV(v0.8h, v2.8h, v1.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_abgr_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
+  asm volatile (
+    RGBTOUV_SETUP_REG
+      "1:          \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
+      "uaddlp      v0.8h, v1.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "uaddlp      v1.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v2.8h, v3.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+      "uadalp      v0.8h, v5.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "uadalp      v1.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v2.8h, v7.16b                 \n"  // R 16 bytes -> 8 shorts.
+
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
+
+    RGBTOUV(v0.8h, v1.8h, v2.8h)
+      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(src_rgba_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v23", "v24", "v25"
+  );
+}
+
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                        int src_stride_rgb24,
                        uint8_t* dst_u,
@@ -3329,19 +3483,18 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
   );
 }
 
-// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
+// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
 static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
-                                             int src_stride,
-                                             uint8_t* dst_u,
-                                             uint8_t* dst_v,
-                                             int width,
-                                             const struct ArgbConstants* c) {
+                                        int src_stride,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width,
+                                        const int8_t* uvconstants) {
   const uint8_t* src1 = src + src_stride;
   asm volatile(
       "movi        v23.8h, #0x80, lsl #8           \n"  // 128.0 (0x8000 in
                                                         // 16-bit)
-      "ldr         q24, [%[c], #16]                \n"  // kRGBToU
-      "ldr         q25, [%[c], #32]                \n"  // kRGBToV
+      "ld2r        {v24.4s, v25.4s}, [%[uvconstants]] \n"
 
       "1:          \n"
       "ld2         {v0.4s, v1.4s}, [%[src]], #32   \n"  // load 8 pixels
@@ -3389,24 +3542,56 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
       "str         d0, [%[dst_u]], #8              \n"  // store 8 pixels U
       "str         d1, [%[dst_v]], #8              \n"  // store 8 pixels V
       "b.gt        1b                              \n"
-      : [src] "+r"(src),      // %[src]
-        [src1] "+r"(src1),    // %[src1]
-        [dst_u] "+r"(dst_u),  // %[dst_u]
-        [dst_v] "+r"(dst_v),  // %[dst_v]
-        [width] "+r"(width)   // %[width]
-      : [c] "r"(c)            // %[c]
+      : [src] "+r"(src),                // %[src]
+        [src1] "+r"(src1),              // %[src1]
+        [dst_u] "+r"(dst_u),            // %[dst_u]
+        [dst_v] "+r"(dst_v),            // %[dst_v]
+        [width] "+r"(width)             // %[width]
+      : [uvconstants] "r"(uvconstants)  // %[uvconstants]
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
         "v24", "v25");
 }
 
+// RGB to BT601 coefficients
+// UB   0.875 coefficient = 112
+// UG -0.5781 coefficient = -74
+// UR -0.2969 coefficient = -38
+// VB -0.1406 coefficient = -18
+// VG -0.7344 coefficient = -94
+// VR   0.875 coefficient = 112
+// I8MM constants are stored negated such that we can store 128 in int8_t.
+
+static const int8_t kARGBToUVCoefficients[] = {
+    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
+    -112, 74, 38, 0, 18, 94, -112, 0,
+};
+
+static const int8_t kABGRToUVCoefficients[] = {
+    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
+    38, 74, -112, 0, -112, 94, 18, 0,
+};
+
+static const int8_t kBGRAToUVCoefficients[] = {
+    // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
+    0, 38, 74, -112, 0, -112, 94, 18,
+};
+
+static const int8_t kRGBAToUVCoefficients[] = {
+    // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
+    0, -112, 74, 38, 0, 18, 94, -112,
+};
+
 void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                  int src_stride_argb,
                                  uint8_t* dst_u,
                                  uint8_t* dst_v,
                                  int width,
                                  const struct ArgbConstants* c) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
-                                   width, c);
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
+      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
+                                   uvconstants);
 }
 
 void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
@@ -3414,8 +3599,8 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
-                                   width, &kArgbI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
+                              kARGBToUVCoefficients);
 }
 
 void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
@@ -3423,8 +3608,8 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v,
-                                   width, &kAbgrI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                              kABGRToUVCoefficients);
 }
 
 void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
@@ -3432,8 +3617,8 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v,
-                                   width, &kBgraI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+                              kBGRAToUVCoefficients);
 }
 
 void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
@@ -3441,17 +3626,36 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v,
-                                   width, &kRgbaI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+                              kRGBAToUVCoefficients);
 }
 
+// RGB to JPEG coefficients
+// UB  0.500    coefficient = 128
+// UG -0.33126  coefficient = -85
+// UR -0.16874  coefficient = -43
+// VB -0.08131  coefficient = -21
+// VG -0.41869  coefficient = -107
+// VR 0.500     coefficient = 128
+// I8MM constants are stored negated such that we can store 128 in int8_t.
+
+static const int8_t kARGBToUVJCoefficients[] = {
+    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
+    -128, 85, 43, 0, 21, 107, -128, 0,
+};
+
+static const int8_t kABGRToUVJCoefficients[] = {
+    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
+    43, 85, -128, 0, -128, 107, 21, 0,
+};
+
 void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                             int src_stride_argb,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
-                                   width, &kArgbJPEGConstants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
+                              kARGBToUVJCoefficients);
 }
 
 void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
@@ -3459,8 +3663,8 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v,
-                                   width, &kAbgrJPEGConstants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                              kABGRToUVJCoefficients);
 }
 
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -3559,184 +3763,251 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
       : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
 }
 
+
+
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                           uint8_t* dst_y,
-                           int width,
-                           const struct ArgbConstants* c) {
+                                  uint8_t* dst_y,
+                                  int width,
+                                  const struct ArgbConstants* c) {
   asm volatile(
-      "ldr         s16, [%3]                     \n"  // load 4 coeffs
-      "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
-      "dup         v18.16b, v16.b[0]             \n"  // B
-      "dup         v19.16b, v16.b[1]             \n"  // G
-      "dup         v20.16b, v16.b[2]             \n"  // R
-      "dup         v21.16b, v16.b[3]             \n"  // A
-      "dup         v22.8h,  v17.h[0]             \n"  // bias
+      "ldr         s0, [%3]                      \n"  // load rgbconstants
+      "ldr         s1, [%3, #48]                 \n"
+      "dup         v6.16b, v0.b[0]               \n"
+      "dup         v7.16b, v0.b[1]               \n"
+      "dup         v16.16b, v0.b[2]              \n"
+      "dup         v17.8h,  v1.h[0]              \n"
       "1:          \n"
       "ld4         {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n"  // load 16
+                                                                 // pixels.
       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
-      "umull       v0.8h, v2.8b, v18.8b          \n"  // B
-      "umull2      v1.8h, v2.16b, v18.16b        \n"
+      "umull       v0.8h, v2.8b, v6.8b           \n"  // B
+      "umull2      v1.8h, v2.16b, v6.16b         \n"
       "prfm        pldl1keep, [%0, 448]          \n"
-      "umlal       v0.8h, v3.8b, v19.8b          \n"  // G
-      "umlal2      v1.8h, v3.16b, v19.16b        \n"
-      "umlal       v0.8h, v4.8b, v20.8b          \n"  // R
-      "umlal2      v1.8h, v4.16b, v20.16b        \n"
-      "umlal       v0.8h, v5.8b, v21.8b          \n"  // A
-      "umlal2      v1.8h, v5.16b, v21.16b        \n"
-      "addhn       v0.8b, v0.8h, v22.8h          \n"  // 16 bit to 8 bit Y
-      "addhn       v1.8b, v1.8h, v22.8h          \n"
+      "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
+      "umlal2      v1.8h, v3.16b, v7.16b         \n"
+      "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
+      "umlal2      v1.8h, v4.16b, v16.16b        \n"
+      "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
+      "addhn       v1.8b, v1.8h, v17.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_y),     // %1
-        "+r"(width)      // %2
-      : "r"(c)           // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
-        "v19", "v20", "v21", "v22");
+      : "+r"(src_argb),    // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      : "r"(c)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17");
 }
 
-void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
-                                   uint8_t* dst_y,
-                                   int width,
-                                   const struct ArgbConstants* c) {
+void ARGBToYMatrixRow_NEON_DotProd(
+    const uint8_t* src_argb,
+    uint8_t* dst_y,
+    int width,
+    const struct ArgbConstants* c) {
   asm volatile(
-      "ldr         s16, [%3]                     \n"  // load 4 coeffs
-      "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
-      "dup         v18.4s, v16.s[0]              \n"
-      "dup         v19.8h, v17.h[0]              \n"
+      "ldr         s0, [%3]                      \n"  // load rgbconstants
+      "ldr         s1, [%3, #48]                 \n"
+      "dup         v16.4s, v0.s[0]               \n"
+      "dup         v17.8h,  v1.h[0]              \n"
       "1:          \n"
       "ld1         {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n"  // load 16
+                                                                    // pixels.
       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
       "movi        v0.16b, #0                    \n"
       "movi        v1.16b, #0                    \n"
       "movi        v2.16b, #0                    \n"
       "movi        v3.16b, #0                    \n"
-      "udot        v0.4s, v4.16b, v18.16b        \n"
-      "udot        v1.4s, v5.16b, v18.16b        \n"
-      "udot        v2.4s, v6.16b, v18.16b        \n"
-      "udot        v3.4s, v7.16b, v18.16b        \n"
+      "udot        v0.4s, v4.16b, v16.16b        \n"
+      "udot        v1.4s, v5.16b, v16.16b        \n"
+      "udot        v2.4s, v6.16b, v16.16b        \n"
+      "udot        v3.4s, v7.16b, v16.16b        \n"
       "uzp1        v0.8h, v0.8h, v1.8h           \n"
       "uzp1        v1.8h, v2.8h, v3.8h           \n"
-      "addhn       v0.8b, v0.8h, v19.8h          \n"
-      "addhn       v1.8b, v1.8h, v19.8h          \n"
+      "addhn       v0.8b, v0.8h, v17.8h          \n"
+      "addhn       v1.8b, v1.8h, v17.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_y),     // %1
-        "+r"(width)      // %2
-      : "r"(c)           // %3
+      : "+r"(src_argb),    // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      : "r"(c)  // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v17", "v18", "v19");
+        "v17");
 }
 
 // RGB to JPeg coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5
+static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
+static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
+
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
+static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
+
+static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
+static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};
 
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
 }
 
 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
 }
 
 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
 }
 
 void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
 }
 
 void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
                              uint8_t* dst_y,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kArgbI601Constants);
+  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants);
 }
 
 void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
                               uint8_t* dst_yj,
                               int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kArgbJPEGConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants);
 }
 
 void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
                              uint8_t* dst_y,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kAbgrI601Constants);
+  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kRawI601Constants);
 }
 
 void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
                               uint8_t* dst_yj,
                               int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kRawJPEGConstants);
 }
 
 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
+// Same code as ARGB, except the LD4
+static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+                                  uint8_t* dst_y,
+                                  int width,
+                                  const struct ArgbConstants* c) {
+  asm volatile(
+      "ldr         s0, [%3]                      \n"  // load rgbconstants
+      "ldr         s1, [%3, #48]                 \n"
+      "dup         v6.16b, v0.b[0]               \n"
+      "dup         v7.16b, v0.b[1]               \n"
+      "dup         v16.16b, v0.b[2]              \n"
+      "dup         v17.8h,  v1.h[0]              \n"
+      "1:          \n"
+      "ld4         {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n"  // load 16
+                                                                 // pixels.
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
+      "umull       v0.8h, v2.8b, v6.8b           \n"  // B
+      "umull2      v1.8h, v2.16b, v6.16b         \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
+      "umlal2      v1.8h, v3.16b, v7.16b         \n"
+      "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
+      "umlal2      v1.8h, v4.16b, v16.16b        \n"
+      "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
+      "addhn       v1.8b, v1.8h, v17.8h          \n"
+      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
+      "b.gt        1b                            \n"
+      : "+r"(src_rgba),    // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      : "r"(c)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17");
+}
 
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
 }
 
 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
 }
 
 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
+  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
 }
 
 void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
                              uint8_t* dst_y,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, &kRgbaI601Constants);
+  // No need for a separate implementation for RGBA inputs, just permute the
+  // RGB constants.
+  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width,
+                                &kRgb24I601DotProdConstants);
 }
 
 void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
                               uint8_t* dst_yj,
                               int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
+  // No need for a separate implementation for RGBA inputs, just permute the
+  // RGB constants.
+  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width,
+                                &kRgb24JPEGDotProdConstants);
 }
 
 void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
                              uint8_t* dst_y,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, &kBgraI601Constants);
+  // No need for a separate implementation for RGBA inputs, just permute the
+  // RGB constants.
+  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width,
+                                &kRawI601DotProdConstants);
 }
 
 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c) {
   asm volatile(
-      "ldr         s16, [%3]                     \n"  // load 4 coeffs
-      "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
-      "dup         v18.16b, v16.b[0]             \n"  // B
-      "dup         v19.16b, v16.b[1]             \n"  // G
-      "dup         v20.16b, v16.b[2]             \n"  // R
-      "dup         v21.8h,  v17.h[0]             \n"  // bias
+      "ldr         d0, [%3]                      \n"  // load rgbconstants
+      "dup         v5.16b, v0.b[0]               \n"
+      "dup         v6.16b, v0.b[1]               \n"
+      "dup         v7.16b, v0.b[2]               \n"
+      "dup         v16.8h,  v0.h[2]              \n"
       "1:          \n"
       "ld3         {v2.16b,v3.16b,v4.16b}, [%0], #48 \n"  // load 16 pixels.
       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
-      "umull       v0.8h, v2.8b, v18.8b          \n"  // B
-      "umull2      v1.8h, v2.16b, v18.16b        \n"
+      "umull       v0.8h, v2.8b, v5.8b           \n"  // B
+      "umull2      v1.8h, v2.16b, v5.16b         \n"
       "prfm        pldl1keep, [%0, 448]          \n"
-      "umlal       v0.8h, v3.8b, v19.8b          \n"  // G
-      "umlal2      v1.8h, v3.16b, v19.16b        \n"
-      "umlal       v0.8h, v4.8b, v20.8b          \n"  // R
-      "umlal2      v1.8h, v4.16b, v20.16b        \n"
-      "addhn       v0.8b, v0.8h, v21.8h          \n"  // 16 bit to 8 bit Y
-      "addhn       v1.8b, v1.8h, v21.8h          \n"
+      "umlal       v0.8h, v3.8b, v6.8b           \n"  // G
+      "umlal2      v1.8h, v3.16b, v6.16b         \n"
+      "umlal       v0.8h, v4.8b, v7.8b           \n"  // R
+      "umlal2      v1.8h, v4.16b, v7.16b         \n"
+      "addhn       v0.8b, v0.8h, v16.8h          \n"  // 16 bit to 8 bit Y
+      "addhn       v1.8b, v1.8h, v16.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
-      : "+r"(src_rgb),  // %0
-        "+r"(dst_y),    // %1
-        "+r"(width)     // %2
-      : "r"(c)          // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
-        "v19", "v20", "v21");
+      : "+r"(src_rgb),     // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      : "r"(c)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
+
+
+
+
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
@@ -4744,10 +5015,10 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src,  // fp16
       "str         s2, [%1], #4                  \n"  // store 1 floats
       "b.gt        2b                            \n"
       "3:          \n"
-      : "+r"(src),                      // %0
-        "+r"(dst),                      // %1
-        "+r"(width)                     // %2
-      : "r"((ptrdiff_t)src_stride * 2)  // %3
+      : "+r"(src),                        // %0
+        "+r"(dst),                        // %1
+        "+r"(width)                       // %2
+      : "r"((ptrdiff_t)(src_stride * 2))  // %3
       : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 91752ed16..93bc431bc 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -1249,22 +1249,16 @@ void MergeUVRow_RVV(const uint8_t* src_u,
 }
 #endif
 
+
+
 // RGB to JPeg coefficients
 // B * 0.1140 coefficient = 29
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
-static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                         {0},
-                                                         {0},
-                                                         {128},
-                                                         {0}};
+static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {0}, {0}, {128}, {0}};
 
-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
-                                                       {0},
-                                                       {0},
-                                                       {128},
-                                                       {0}};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0}, {128}, {0}};
 
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@@ -1272,24 +1266,16 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
 // R * 0.2578 coefficient = 66
 // Add 16.5 = 0x1080
 
-static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                         {0},
-                                                         {0},
-                                                         {0x1080},
-                                                         {0}};
+static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {0}, {0}, {0x1080}, {0}};
 
-static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                       {0},
-                                                       {0},
-                                                       {0x1080},
-                                                       {0}};
+static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {0}, {0}, {0x1080}, {0}};
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored
 #ifdef HAS_ARGBTOYMATRIXROW_RVV
 void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c) {
   assert(width != 0);
   size_t w = (size_t)width;
   vuint8m2_t v_by, v_gy, v_ry;  // vectors are to store RGBToY constant
diff --git a/source/row_sme.cc b/source/row_sme.cc
index 2291562e2..fca536dc4 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -1127,10 +1127,9 @@ __arm_locally_streaming void ARGBToUVMatrixRow_SME(
     uint8_t* dst_v,
     int width,
     const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1],
-                           (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-                           (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
-                           (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
+      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
   ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                            uvconstants);
 }
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 662685882..7d8734921 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -223,10 +223,9 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                             uint8_t* dst_v,
                             int width,
                             const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1],
-                           (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-                           (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
-                           (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
+      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
   ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                            uvconstants);
 }
diff --git a/source/row_win.cc b/source/row_win.cc
index a7ed75199..77070d031 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -8,19 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 
 // This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) &&                                 \
-    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
-     defined(_M_X86)) &&                                            \
-    ((defined(_MSC_VER) && !defined(__clang__)) ||                  \
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__) || \
+     defined(_M_X64) || defined(_M_X86)) && \
+    ((defined(_MSC_VER) && !defined(__clang__)) || \
      defined(LIBYUV_ENABLE_ROWWIN))
 
 #include <emmintrin.h>
-#include <immintrin.h>  // For AVX2 intrinsics
 #include <tmmintrin.h>  // For _mm_maddubs_epi16
+#include <immintrin.h>  // For AVX2 intrinsics
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -102,91 +102,42 @@ extern "C" {
   _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
   dst_argb += 32;
 
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+#if defined(HAS_I422TOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_I444TOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_ARGBTOYROW_AVX2)
 
 #if defined(__clang__) || defined(__GNUC__)
 #define LIBYUV_TARGET_AVX2 __attribute__((target("avx2")))
-#define LIBYUV_TARGET_AVX512BW \
-  __attribute__((target("avx512bw,avx512vl,avx512f")))
+#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f")))
 #else
 #define LIBYUV_TARGET_AVX2
 #define LIBYUV_TARGET_AVX512BW
 #endif
 
-// Convert 32 ARGB pixels (128 bytes) to 32 UV444 values.
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2) || defined(HAS_ARGBTOUV444MATRIXROW_AVX2)
-LIBYUV_TARGET_AVX2
-void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c) {
-  __m256i ymm_u =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
-  __m256i ymm_v =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
-  __m256i ymm5 = _mm256_set1_epi16((short)0x8000);
-  __m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
-    __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
-    __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + 64));
-    __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + 96));
-    src_argb += 128;
-
-    __m256i ymm0_u = _mm256_maddubs_epi16(ymm0, ymm_u);
-    __m256i ymm1_u = _mm256_maddubs_epi16(ymm1, ymm_u);
-    __m256i ymm2_u = _mm256_maddubs_epi16(ymm2, ymm_u);
-    __m256i ymm3_u = _mm256_maddubs_epi16(ymm3, ymm_u);
-
-    __m256i ymm0_v = _mm256_maddubs_epi16(ymm0, ymm_v);
-    __m256i ymm1_v = _mm256_maddubs_epi16(ymm1, ymm_v);
-    __m256i ymm2_v = _mm256_maddubs_epi16(ymm2, ymm_v);
-    __m256i ymm3_v = _mm256_maddubs_epi16(ymm3, ymm_v);
-
-    ymm0_u = _mm256_hadd_epi16(ymm0_u, ymm1_u);
-    ymm2_u = _mm256_hadd_epi16(ymm2_u, ymm3_u);
-
-    ymm0_v = _mm256_hadd_epi16(ymm0_v, ymm1_v);
-    ymm2_v = _mm256_hadd_epi16(ymm2_v, ymm3_v);
-
-    ymm0_u = _mm256_sub_epi16(ymm5, ymm0_u);
-    ymm2_u = _mm256_sub_epi16(ymm5, ymm2_u);
-
-    ymm0_v = _mm256_sub_epi16(ymm5, ymm0_v);
-    ymm2_v = _mm256_sub_epi16(ymm5, ymm2_v);
-
-    ymm0_u = _mm256_srli_epi16(ymm0_u, 8);
-    ymm2_u = _mm256_srli_epi16(ymm2_u, 8);
-
-    ymm0_v = _mm256_srli_epi16(ymm0_v, 8);
-    ymm2_v = _mm256_srli_epi16(ymm2_v, 8);
-
-    ymm0_u = _mm256_packus_epi16(ymm0_u, ymm2_u);
-    ymm0_u = _mm256_permutevar8x32_epi32(ymm0_u, perm_mask);
-
-    ymm0_v = _mm256_packus_epi16(ymm0_v, ymm2_v);
-    ymm0_v = _mm256_permutevar8x32_epi32(ymm0_v, perm_mask);
-
-    _mm256_storeu_si256((__m256i*)dst_u, ymm0_u);
-    _mm256_storeu_si256((__m256i*)dst_v, ymm0_v);
-    dst_u += 32;
-    dst_v += 32;
-    width -= 32;
-  }
-}
-#endif
 LIBYUV_TARGET_AVX2
 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_y,
                            int width,
                            const struct ArgbConstants* c) {
   __m256i ymm5 = _mm256_set1_epi8((char)0x80);
-  __m256i ymm4 =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToY));
-  __m256i ymm7 =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddY));
+  __m128i kRGBToY = _mm_loadu_si128((const __m128i*)c->kRGBToY);
+  __m256i ymm4 = _mm256_broadcastsi128_si256(kRGBToY);
+  __m128i kAddY = _mm_loadu_si128((const __m128i*)c->kAddY);
+  __m256i ymm7 = _mm256_broadcastsi128_si256(kAddY);
   __m256i ymm6 = _mm256_maddubs_epi16(ymm4, ymm5);
   ymm6 = _mm256_hadd_epi16(ymm6, ymm6);
   ymm7 = _mm256_sub_epi16(ymm7, ymm6);
@@ -266,33 +217,27 @@ void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
 LIBYUV_TARGET_AVX2
 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   __m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
-  __m128i shuf_low =
-      _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
-  __m128i shuf_high =
-      _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
+  __m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
+  __m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
   __m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low);
   __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high);
 
   while (width > 0) {
     __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw);
     __m256i ymm0 = _mm256_castsi128_si256(xmm0);
-    ymm0 = _mm256_inserti128_si256(
-        ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
+    ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
 
     __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24));
     __m256i ymm1 = _mm256_castsi128_si256(xmm1);
-    ymm1 = _mm256_inserti128_si256(
-        ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);
+    ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);
 
     __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48));
     __m256i ymm2 = _mm256_castsi128_si256(xmm2);
-    ymm2 = _mm256_inserti128_si256(
-        ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
+    ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
 
     __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68));
     __m256i ymm3 = _mm256_castsi128_si256(xmm3);
-    ymm3 = _mm256_inserti128_si256(
-        ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);
+    ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);
 
     ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
     ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@@ -318,13 +263,10 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 
 #ifdef HAS_RAWTOARGBROW_AVX512BW
 LIBYUV_TARGET_AVX512BW
-void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
-                           uint8_t* dst_argb,
-                           const __m128i* shuffler,
-                           int width) {
+void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) {
   __m512i zmm_alpha = _mm512_set1_epi32(0xff000000);
-  __m512i zmm_perm =
-      _mm512_set_epi32(12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
+  __m512i zmm_perm = _mm512_set_epi32(
+      12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
   __m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler));
 
   while (width > 0) {
@@ -360,26 +302,20 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
 }
 
 LIBYUV_TARGET_AVX512BW
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw,
-                           uint8_t* dst_argb,
-                           int width) {
-  __m128i shuf =
-      _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
+void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  __m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
   RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width);
 }
 
 LIBYUV_TARGET_AVX512BW
-void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24,
-                             uint8_t* dst_argb,
-                             int width) {
-  __m128i shuf =
-      _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
+void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+  __m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
   RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width);
 }
 #endif
 
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX2
-LIBYUV_TARGET_AVX2
+LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
 void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                             int src_stride_argb,
                             uint8_t* dst_u,
@@ -389,19 +325,16 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
   __m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
   __m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
   __m256i ymm_0101 = _mm256_set1_epi16(0x0101);
-  __m256i ymm_shuf =
-      _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, 0,
-                       4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
+  __m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+                                      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
   __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
   __m256i ymm_zero = _mm256_setzero_si256();
 
   while (width > 0) {
     __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
     __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
-    __m256i ymm2 =
-        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
-    __m256i ymm3 =
-        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
+    __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
+    __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
 
     ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
     ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@@ -470,515 +403,12 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
 }
 #endif
 
-#ifdef HAS_MIRRORROW_AVX2
-LIBYUV_TARGET_AVX2
-void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
-  src += width;
-  while (width > 0) {
-    src -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src);
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e);
-    _mm256_storeu_si256((__m256i*)dst, ymm0);
-    dst += 32;
-    width -= 32;
-  }
-}
 #endif
 
-#ifdef HAS_MIRRORUVROW_AVX2
-LIBYUV_TARGET_AVX2
-void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
-  src_uv += width * 2;
-  while (width > 0) {
-    src_uv -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv);
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e);
-    _mm256_storeu_si256((__m256i*)dst_uv, ymm0);
-    dst_uv += 32;
-    width -= 16;
-  }
-}
-#endif
-
-#ifdef HAS_MIRRORSPLITUVROW_AVX2
-LIBYUV_TARGET_AVX2
-void MirrorSplitUVRow_AVX2(const uint8_t* src_uv,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width) {
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
-  src_uv += width * 2;
-  while (width > 0) {
-    src_uv -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv);
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm0 = _mm256_permute4x64_epi64(ymm0, 0x72);
-    _mm_storeu_si128((__m128i*)dst_u, _mm256_castsi256_si128(ymm0));
-    _mm_storeu_si128((__m128i*)dst_v, _mm256_extracti128_si256(ymm0, 1));
-    dst_u += 16;
-    dst_v += 16;
-    width -= 16;
-  }
-}
-#endif
-
-#ifdef HAS_RGB24MIRRORROW_AVX2
-LIBYUV_TARGET_AVX2
-void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_rgb24,
-                         int width) {
-  __m256i shuf0 =
-      _mm256_setr_epi8(-1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2, -1,
-                       12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2);
-  __m128i shuf1 =
-      _mm_setr_epi8(13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1);
-
-  src_rgb24 += width * 3 - 96;
-  while (width > 0) {
-    __m128i v0_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 0));
-    __m128i v0_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 15));
-    __m256i v0 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1);
-
-    __m128i v1_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 30));
-    __m128i v1_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 45));
-    __m256i v1 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1);
-
-    __m128i v2_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 60));
-    __m128i v2_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 75));
-    __m256i v2 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1);
-
-    __m128i v3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 80));
-
-    v0 = _mm256_shuffle_epi8(v0, shuf0);
-    v1 = _mm256_shuffle_epi8(v1, shuf0);
-    v2 = _mm256_shuffle_epi8(v2, shuf0);
-    v3 = _mm_shuffle_epi8(v3, shuf1);
-
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 80), _mm256_castsi256_si128(v0));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 65),
-                     _mm256_extracti128_si256(v0, 1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 50), _mm256_castsi256_si128(v1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 35),
-                     _mm256_extracti128_si256(v1, 1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 20), _mm256_castsi256_si128(v2));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 5),
-                     _mm256_extracti128_si256(v2, 1));
-    _mm_storel_epi64((__m128i*)(dst_rgb24 + 0), v3);
-
-    src_rgb24 -= 96;
-    dst_rgb24 += 96;
-    width -= 32;
-  }
-}
-#endif
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-LIBYUV_TARGET_AVX2
-void InterpolateRow_AVX2(uint8_t* dst_ptr,
-                         const uint8_t* src_ptr,
-                         ptrdiff_t src_stride,
-                         int width,
-                         int source_y_fraction) {
-  int y1 = source_y_fraction;
-  int y0 = 256 - y1;
-  const uint8_t* src_ptr1 = src_ptr + src_stride;
-  __m256i ymm_y = _mm256_set1_epi16((y1 << 8) | y0);
-  __m256i ymm_8080 = _mm256_set1_epi16(0x8080);
-  int i;
-
-  if (y1 == 0) {
-    for (i = 0; i < width; i += 32) {
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_loadu_si256((const __m256i*)(src_ptr + i)));
-    }
-  } else if (y1 == 128) {
-    for (i = 0; i < width; i += 32) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i), _mm256_avg_epu8(row0, row1));
-    }
-  } else {
-    for (i = 0; i < width; i += 32) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      __m256i low = _mm256_unpacklo_epi8(row0, row1);
-      __m256i high = _mm256_unpackhi_epi8(row0, row1);
-      low = _mm256_sub_epi8(low, ymm_8080);
-      high = _mm256_sub_epi8(high, ymm_8080);
-      low = _mm256_maddubs_epi16(ymm_y, low);
-      high = _mm256_maddubs_epi16(ymm_y, high);
-      low = _mm256_add_epi16(low, ymm_8080);
-      high = _mm256_add_epi16(high, ymm_8080);
-      low = _mm256_srli_epi16(low, 8);
-      high = _mm256_srli_epi16(high, 8);
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_packus_epi16(low, high));
-    }
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_INTERPOLATEROW_16_AVX2
-LIBYUV_TARGET_AVX2
-void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
-                            const uint16_t* src_ptr,
-                            ptrdiff_t src_stride,
-                            int width,
-                            int source_y_fraction) {
-  int y1 = source_y_fraction;
-  int y0 = 256 - y1;
-  const uint16_t* src_ptr1 = src_ptr + src_stride;
-  __m256i ymm_y = _mm256_set1_epi32((y1 << 16) | y0);
-  __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
-  __m256i ymm_round = _mm256_set1_epi32(8388736);  // 0x800000 + 128
-  int i;
-
-  if (y1 == 0) {
-    for (i = 0; i < width; i += 16) {
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_loadu_si256((const __m256i*)(src_ptr + i)));
-    }
-  } else if (y1 == 128) {
-    for (i = 0; i < width; i += 16) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_avg_epu16(row0, row1));
-    }
-  } else {
-    for (i = 0; i < width; i += 16) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      __m256i row0l = _mm256_unpacklo_epi16(row0, row1);
-      __m256i row0h = _mm256_unpackhi_epi16(row0, row1);
-      row0l = _mm256_sub_epi16(row0l, ymm_8000);
-      row0h = _mm256_sub_epi16(row0h, ymm_8000);
-      __m256i resl = _mm256_madd_epi16(row0l, ymm_y);
-      __m256i resh = _mm256_madd_epi16(row0h, ymm_y);
-      resl = _mm256_add_epi32(resl, ymm_round);
-      resh = _mm256_add_epi32(resh, ymm_round);
-      resl = _mm256_srai_epi32(resl, 8);
-      resh = _mm256_srai_epi32(resh, 8);
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_packus_epi32(resl, resh));
-    }
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
-  __m256i ymm_shuf = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  src += width * 4;
-  while (width > 0) {
-    src -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src);
-    ymm0 = _mm256_permutevar8x32_epi32(ymm0, ymm_shuf);
-    _mm256_storeu_si256((__m256i*)dst, ymm0);
-    dst += 32;
-    width -= 8;
-  }
-}
-#endif
-
-#ifdef HAS_J400TOARGBROW_AVX2
-alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = {
-    0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
-    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u};
-alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = {
-    8u,   8u,   8u,  128u, 9u,   9u,   9u,  128u, 10u,  10u, 10u,
-    128u, 11u,  11u, 11u,  128u, 12u,  12u, 12u,  128u, 13u, 13u,
-    13u,  128u, 14u, 14u,  14u,  128u, 15u, 15u,  15u,  128u};
-
-LIBYUV_TARGET_AVX2
-void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-  __m256i ymm_mask0 =
-      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
-  __m256i ymm_mask1 =
-      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
-  __m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u);
-
-  while (width > 0) {
-    __m256i ymm0 =
-        _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
-
-    __m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0);
-    __m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1);
-
-    ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
-    ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm1);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm2);
-
-    src_y += 16;
-    dst_argb += 64;
-    width -= 16;
-  }
-}
-#endif  // HAS_J400TOARGBROW_AVX2
-
-#ifdef HAS_RGB24TOARGBROW_AVX2
-alignas(16) static const uint8_t kShuffleMaskRGB24ToARGB[2][16] = {
-    {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u},
-    {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u,
-     128u}};
-#endif
-
-#ifdef HAS_RGB565TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
-                          uint8_t* dst_argb,
-                          int width) {
-  __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
-  __m256i ymm_scale_g = _mm256_set1_epi32(0x20802080);
-  __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
-  __m256i ymm_mask_g = _mm256_set1_epi16(0x07e0);
-  __m256i ymm_mask_a = _mm256_set1_epi16((short)0xff00);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_rgb565);
-    __m256i ymm1 = ymm0;
-    __m256i ymm2 = ymm0;
-
-    ymm1 = _mm256_and_si256(ymm1, ymm_mask_b);
-    ymm2 = _mm256_slli_epi16(ymm2, 11);
-    ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
-    ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
-    ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
-
-    ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
-    ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
-    ymm0 = _mm256_or_si256(ymm0, ymm_mask_a);  // GA
-
-    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
-    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
-
-    ymm0 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20);
-    ymm1 = _mm256_permute2x128_si256(ymm2, ymm1, 0x31);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm0);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-
-    src_rgb565 += 32;
-    dst_argb += 64;
-    width -= 16;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGB1555TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
-                            uint8_t* dst_argb,
-                            int width) {
-  __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
-  __m256i ymm_scale_g = _mm256_set1_epi32(0x42004200);
-  __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
-  __m256i ymm_mask_g = _mm256_set1_epi16(0x03e0);
-  __m256i ymm_mask_a = _mm256_set1_epi16((short)0xff00);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb1555);
-    __m256i ymm1 = ymm0;
-    __m256i ymm2 = ymm0;
-
-    ymm1 = _mm256_slli_epi16(ymm1, 1);
-    ymm2 = _mm256_slli_epi16(ymm2, 11);
-    ymm1 = _mm256_and_si256(ymm1, ymm_mask_b);
-    ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
-    ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
-    ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
-
-    ymm2 = ymm0;
-    ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
-    ymm2 = _mm256_srai_epi16(ymm2, 8);
-    ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
-    ymm2 = _mm256_and_si256(ymm2, ymm_mask_a);
-    ymm0 = _mm256_or_si256(ymm0, ymm2);  // GA
-
-    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
-    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
-
-    ymm0 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20);
-    ymm1 = _mm256_permute2x128_si256(ymm2, ymm1, 0x31);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm0);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-
-    src_argb1555 += 32;
-    dst_argb += 64;
-    width -= 16;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGB4444TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
-                            uint8_t* dst_argb,
-                            int width) {
-  __m256i ymm_mask = _mm256_set1_epi32(0x0f0f0f0f);
-  __m256i ymm_mask2 = _mm256_slli_epi32(ymm_mask, 4);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb4444);
-    __m256i ymm2 = ymm0;
-
-    ymm0 = _mm256_and_si256(ymm0, ymm_mask);
-    ymm2 = _mm256_and_si256(ymm2, ymm_mask2);
-
-    __m256i ymm1 = ymm0;
-    __m256i ymm3 = ymm2;
-
-    ymm1 = _mm256_slli_epi16(ymm1, 4);
-    ymm3 = _mm256_srli_epi16(ymm3, 4);
-
-    ymm0 = _mm256_or_si256(ymm0, ymm1);
-    ymm2 = _mm256_or_si256(ymm2, ymm3);
-
-    ymm1 = ymm0;
-    ymm0 = _mm256_unpacklo_epi8(ymm0, ymm2);
-    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm2);
-
-    ymm2 = _mm256_permute2x128_si256(ymm0, ymm1, 0x20);
-    ymm1 = _mm256_permute2x128_si256(ymm0, ymm1, 0x31);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm2);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-
-    src_argb4444 += 32;
-    dst_argb += 64;
-    width -= 16;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_RGB24TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_argb,
-                         int width) {
-  __m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0]));
-  __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(
-      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1]));
-
-  while (width > 0) {
-    __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_rgb24);
-    __m256i ymm0 = _mm256_castsi128_si256(xmm0);
-    ymm0 = _mm256_inserti128_si256(
-        ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1);
-
-    __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 24));
-    __m256i ymm1 = _mm256_castsi128_si256(xmm1);
-    ymm1 = _mm256_inserti128_si256(
-        ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1);
-
-    __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 48));
-    __m256i ymm2 = _mm256_castsi128_si256(xmm2);
-    ymm2 = _mm256_inserti128_si256(
-        ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1);
-
-    __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 68));
-    __m256i ymm3 = _mm256_castsi128_si256(xmm3);
-    ymm3 = _mm256_inserti128_si256(
-        ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1);
-
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
-    ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
-    ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf2);
-
-    ymm0 = _mm256_or_si256(ymm0, ymm_alpha);
-    ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
-    ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
-    ymm3 = _mm256_or_si256(ymm3, ymm_alpha);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm0);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 64), ymm2);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 96), ymm3);
-
-    src_rgb24 += 96;
-    dst_argb += 128;
-    width -= 32;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
-                         uint8_t* dst_argb,
-                         const uint8_t* shuffler,
-                         int width) {
-  __m256i control =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)shuffler));
-  while (width >= 16) {
-    __m256i row = _mm256_loadu_si256((const __m256i*)src_argb);
-    __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
-    row = _mm256_shuffle_epi8(row, control);
-    row1 = _mm256_shuffle_epi8(row1, control);
-    _mm256_storeu_si256((__m256i*)dst_argb, row);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), row1);
-    src_argb += 64;
-    dst_argb += 64;
-    width -= 16;
-  }
-}
-#endif
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX512BW
-LIBYUV_TARGET_AVX512BW
-void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
-                             uint8_t* dst_argb,
-                             const uint8_t* shuffler,
-                             int width) {
-  __m512i control =
-      _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)shuffler));
-  while (width >= 32) {
-    __m512i row = _mm512_loadu_si512((const __m512i*)src_argb);
-    __m512i row1 = _mm512_loadu_si512((const __m512i*)(src_argb + 64));
-    row = _mm512_shuffle_epi8(row, control);
-    row1 = _mm512_shuffle_epi8(row1, control);
-    _mm512_storeu_si512((__m512i*)dst_argb, row);
-    _mm512_storeu_si512((__m512i*)(dst_argb + 64), row1);
-    src_argb += 128;
-    dst_argb += 128;
-    width -= 32;
-  }
-}
-#endif
-
-#endif
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) ||
-        // defined(__i386__) || defined(_M_X64) || defined(_M_X86)) &&
-        // ((defined(_MSC_VER) && !defined(__clang__)) ||
-        // defined(LIBYUV_ENABLE_ROWWIN))
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_X86)) && ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN))
diff --git a/source/scale.cc b/source/scale.cc
index 4b7b2d3bc..9c1e9b264 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -11,7 +11,6 @@
 #include "libyuv/scale.h"
 
 #include <assert.h>
-#include <limits.h>
 #include <string.h>
 
 #include "libyuv/cpu_id.h"
@@ -40,8 +39,8 @@ static void ScalePlaneDown2(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             enum FilterMode filtering) {
@@ -52,7 +51,7 @@ static void ScalePlaneDown2(int src_width,
           ? ScaleRowDown2_C
           : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
                                         : ScaleRowDown2Box_C);
-  ptrdiff_t row_stride = src_stride * 2;
+  int row_stride = src_stride * 2;
   (void)src_width;
   (void)src_height;
   if (!filtering) {
@@ -152,8 +151,8 @@ static void ScalePlaneDown2_16(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                enum FilterMode filtering) {
@@ -164,7 +163,7 @@ static void ScalePlaneDown2_16(int src_width,
           ? ScaleRowDown2_16_C
           : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
                                         : ScaleRowDown2Box_16_C);
-  ptrdiff_t row_stride = src_stride * 2;
+  int row_stride = src_stride * 2;
   (void)src_width;
   (void)src_height;
   if (!filtering) {
@@ -229,7 +228,7 @@ void ScalePlaneDown2_16To8(int src_width,
                  ? ScaleRowDown2_16To8_C
                  : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C
                                                : ScaleRowDown2Box_16To8_C));
-  ptrdiff_t row_stride = (ptrdiff_t)src_stride * 2;
+  int row_stride = src_stride * 2;
   (void)dst_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
@@ -260,8 +259,8 @@ static void ScalePlaneDown4(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             enum FilterMode filtering) {
@@ -269,7 +268,7 @@ static void ScalePlaneDown4(int src_width,
   void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                         uint8_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
-  ptrdiff_t row_stride = src_stride * 4;
+  int row_stride = src_stride * 4;
   (void)src_width;
   (void)src_height;
   if (!filtering) {
@@ -332,8 +331,8 @@ static void ScalePlaneDown4_16(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                enum FilterMode filtering) {
@@ -341,7 +340,7 @@ static void ScalePlaneDown4_16(int src_width,
   void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                         uint16_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
-  ptrdiff_t row_stride = src_stride * 4;
+  int row_stride = src_stride * 4;
   (void)src_width;
   (void)src_height;
   if (!filtering) {
@@ -376,8 +375,8 @@ static void ScalePlaneDown34(int src_width,
                              int src_height,
                              int dst_width,
                              int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                              const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              enum FilterMode filtering) {
@@ -386,7 +385,7 @@ static void ScalePlaneDown34(int src_width,
                            uint8_t* dst_ptr, int dst_width);
   void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   (void)src_width;
   (void)src_height;
   assert(dst_width % 3 == 0);
@@ -503,8 +502,8 @@ static void ScalePlaneDown34_16(int src_width,
                                 int src_height,
                                 int dst_width,
                                 int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                 const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
@@ -513,7 +512,7 @@ static void ScalePlaneDown34_16(int src_width,
                            uint16_t* dst_ptr, int dst_width);
   void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                            uint16_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   (void)src_width;
   (void)src_height;
   assert(dst_width % 3 == 0);
@@ -589,8 +588,8 @@ static void ScalePlaneDown38(int src_width,
                              int src_height,
                              int dst_width,
                              int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                              const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              enum FilterMode filtering) {
@@ -599,7 +598,7 @@ static void ScalePlaneDown38(int src_width,
                            uint8_t* dst_ptr, int dst_width);
   void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   assert(dst_width % 3 == 0);
   (void)src_width;
   (void)src_height;
@@ -709,8 +708,8 @@ static void ScalePlaneDown38_16(int src_width,
                                 int src_height,
                                 int dst_width,
                                 int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                 const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
@@ -719,7 +718,7 @@ static void ScalePlaneDown38_16(int src_width,
                            uint16_t* dst_ptr, int dst_width);
   void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                            uint16_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   (void)src_width;
   (void)src_height;
   assert(dst_width % 3 == 0);
@@ -902,8 +901,8 @@ static int ScalePlaneBox(int src_width,
                          int src_height,
                          int dst_width,
                          int dst_height,
-                         ptrdiff_t src_stride,
-                         ptrdiff_t dst_stride,
+                         int src_stride,
+                         int dst_stride,
                          const uint8_t* src_ptr,
                          uint8_t* dst_ptr) {
   int j, k;
@@ -968,7 +967,7 @@ static int ScalePlaneBox(int src_width,
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint8_t* src = src_ptr + iy * src_stride;
+      const uint8_t* src = src_ptr + iy * (int64_t)src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -991,8 +990,8 @@ static int ScalePlaneBox_16(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                             const uint16_t* src_ptr,
                             uint16_t* dst_ptr) {
   int j, k;
@@ -1025,7 +1024,7 @@ static int ScalePlaneBox_16(int src_width,
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint16_t* src = src_ptr + iy * src_stride;
+      const uint16_t* src = src_ptr + iy * (int64_t)src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -1049,8 +1048,8 @@ static int ScalePlaneBilinearDown(int src_width,
                                   int src_height,
                                   int dst_width,
                                   int dst_height,
-                                  ptrdiff_t src_stride,
-                                  ptrdiff_t dst_stride,
+                                  int src_stride,
+                                  int dst_stride,
                                   const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   enum FilterMode filtering) {
@@ -1077,6 +1076,14 @@ static int ScalePlaneBilinearDown(int src_width,
              &dx, &dy);
   src_width = Abs(src_width);
 
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -1139,7 +1146,7 @@ static int ScalePlaneBilinearDown(int src_width,
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint8_t* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
@@ -1161,8 +1168,8 @@ static int ScalePlaneBilinearDown_16(int src_width,
                                      int src_height,
                                      int dst_width,
                                      int dst_height,
-                                     ptrdiff_t src_stride,
-                                     ptrdiff_t dst_stride,
+                                     int src_stride,
+                                     int dst_stride,
                                      const uint16_t* src_ptr,
                                      uint16_t* dst_ptr,
                                      enum FilterMode filtering) {
@@ -1189,6 +1196,14 @@ static int ScalePlaneBilinearDown_16(int src_width,
              &dx, &dy);
   src_width = Abs(src_width);
 
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_16_Any_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_16_Any_SSSE3;
@@ -1230,7 +1245,7 @@ static int ScalePlaneBilinearDown_16(int src_width,
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint16_t* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
@@ -1253,8 +1268,8 @@ static int ScalePlaneBilinearUp(int src_width,
                                 int src_height,
                                 int dst_width,
                                 int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                 const uint8_t* src_ptr,
                                 uint8_t* dst_ptr,
                                 enum FilterMode filtering) {
@@ -1275,6 +1290,14 @@ static int ScalePlaneBilinearUp(int src_width,
              &dx, &dy);
   src_width = Abs(src_width);
 
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -1340,7 +1363,7 @@ static int ScalePlaneBilinearUp(int src_width,
   }
   {
     int yi = y >> 16;
-    const uint8_t* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
 
     // Allocate 2 row buffers.
     const int row_size = (dst_width + 31) & ~31;
@@ -1349,7 +1372,7 @@ static int ScalePlaneBilinearUp(int src_width,
       return 1;
 
     uint8_t* rowptr = row;
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
     int lasty = yi;
 
     ScaleFilterCols(rowptr, src, dst_width, x, dx);
@@ -1367,7 +1390,7 @@ static int ScalePlaneBilinearUp(int src_width,
         if (y > max_y) {
           y = max_y;
           yi = y >> 16;
-          src = src_ptr + yi * src_stride;
+          src = src_ptr + yi * (int64_t)src_stride;
         }
         if (yi != lasty) {
           ScaleFilterCols(rowptr, src, dst_width, x, dx);
@@ -1402,8 +1425,8 @@ static void ScalePlaneUp2_Linear(int src_width,
                                  int src_height,
                                  int dst_width,
                                  int dst_height,
-                                 ptrdiff_t src_stride,
-                                 ptrdiff_t dst_stride,
+                                 int src_stride,
+                                 int dst_stride,
                                  const uint8_t* src_ptr,
                                  uint8_t* dst_ptr) {
   void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
@@ -1446,13 +1469,13 @@ static void ScalePlaneUp2_Linear(int src_width,
 #endif
 
   if (dst_height == 1) {
-    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
                dst_width);
   } else {
     dy = FixedDiv(src_height - 1, dst_height - 1);
     y = (1 << 15) - 1;
     for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
       dst_ptr += dst_stride;
       y += dy;
     }
@@ -1467,8 +1490,8 @@ static void ScalePlaneUp2_Bilinear(int src_width,
                                    int src_height,
                                    int dst_width,
                                    int dst_height,
-                                   ptrdiff_t src_stride,
-                                   ptrdiff_t dst_stride,
+                                   int src_stride,
+                                   int dst_stride,
                                    const uint8_t* src_ptr,
                                    uint8_t* dst_ptr) {
   void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
@@ -1533,8 +1556,8 @@ static void ScalePlaneUp2_12_Linear(int src_width,
                                     int src_height,
                                     int dst_width,
                                     int dst_height,
-                                    ptrdiff_t src_stride,
-                                    ptrdiff_t dst_stride,
+                                    int src_stride,
+                                    int dst_stride,
                                     const uint16_t* src_ptr,
                                     uint16_t* dst_ptr) {
   void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
@@ -1566,13 +1589,13 @@ static void ScalePlaneUp2_12_Linear(int src_width,
 #endif
 
   if (dst_height == 1) {
-    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
                dst_width);
   } else {
     dy = FixedDiv(src_height - 1, dst_height - 1);
     y = (1 << 15) - 1;
     for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
       dst_ptr += dst_stride;
       y += dy;
     }
@@ -1588,8 +1611,8 @@ static void ScalePlaneUp2_12_Bilinear(int src_width,
                                       int src_height,
                                       int dst_width,
                                       int dst_height,
-                                      ptrdiff_t src_stride,
-                                      ptrdiff_t dst_stride,
+                                      int src_stride,
+                                      int dst_stride,
                                       const uint16_t* src_ptr,
                                       uint16_t* dst_ptr) {
   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
@@ -1636,8 +1659,8 @@ static void ScalePlaneUp2_16_Linear(int src_width,
                                     int src_height,
                                     int dst_width,
                                     int dst_height,
-                                    ptrdiff_t src_stride,
-                                    ptrdiff_t dst_stride,
+                                    int src_stride,
+                                    int dst_stride,
                                     const uint16_t* src_ptr,
                                     uint16_t* dst_ptr) {
   void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
@@ -1669,13 +1692,13 @@ static void ScalePlaneUp2_16_Linear(int src_width,
 #endif
 
   if (dst_height == 1) {
-    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
                dst_width);
   } else {
     dy = FixedDiv(src_height - 1, dst_height - 1);
     y = (1 << 15) - 1;
     for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
       dst_ptr += dst_stride;
       y += dy;
     }
@@ -1686,8 +1709,8 @@ static void ScalePlaneUp2_16_Bilinear(int src_width,
                                       int src_height,
                                       int dst_width,
                                       int dst_height,
-                                      ptrdiff_t src_stride,
-                                      ptrdiff_t dst_stride,
+                                      int src_stride,
+                                      int dst_stride,
                                       const uint16_t* src_ptr,
                                       uint16_t* dst_ptr) {
   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
@@ -1734,8 +1757,8 @@ static int ScalePlaneBilinearUp_16(int src_width,
                                    int src_height,
                                    int dst_width,
                                    int dst_height,
-                                   ptrdiff_t src_stride,
-                                   ptrdiff_t dst_stride,
+                                   int src_stride,
+                                   int dst_stride,
                                    const uint16_t* src_ptr,
                                    uint16_t* dst_ptr,
                                    enum FilterMode filtering) {
@@ -1756,6 +1779,14 @@ static int ScalePlaneBilinearUp_16(int src_width,
              &dx, &dy);
   src_width = Abs(src_width);
 
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_16_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_16_Any_SSSE3;
@@ -1807,12 +1838,12 @@ static int ScalePlaneBilinearUp_16(int src_width,
   }
   {
     int yi = y >> 16;
-    const uint16_t* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
 
     // Allocate 2 row buffers.
     const int row_size = (dst_width + 31) & ~31;
     align_buffer_64(row, row_size * 4);
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
     int lasty = yi;
     uint16_t* rowptr = (uint16_t*)row;
     if (!row)
@@ -1833,7 +1864,7 @@ static int ScalePlaneBilinearUp_16(int src_width,
         if (y > max_y) {
           y = max_y;
           yi = y >> 16;
-          src = src_ptr + yi * src_stride;
+          src = src_ptr + yi * (int64_t)src_stride;
         }
         if (yi != lasty) {
           ScaleFilterCols(rowptr, src, dst_width, x, dx);
@@ -1868,8 +1899,8 @@ static void ScalePlaneSimple(int src_width,
                              int src_height,
                              int dst_width,
                              int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                              const uint8_t* src_ptr,
                              uint8_t* dst_ptr) {
   int i;
@@ -1894,7 +1925,8 @@ static void ScalePlaneSimple(int src_width,
   }
 
   for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+              dx);
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -1904,8 +1936,8 @@ static void ScalePlaneSimple_16(int src_width,
                                 int src_height,
                                 int dst_width,
                                 int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                 const uint16_t* src_ptr,
                                 uint16_t* dst_ptr) {
   int i;
@@ -1930,7 +1962,8 @@ static void ScalePlaneSimple_16(int src_width,
   }
 
   for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+              dx);
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -1948,14 +1981,6 @@ int ScalePlane(const uint8_t* src,
                int dst_width,
                int dst_height,
                enum FilterMode filtering) {
-  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
-  // This prevents FixedDiv signed integer overflows that can lead to division
-  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
   // Simplify filtering when possible.
   filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                 filtering);
@@ -1963,7 +1988,7 @@ int ScalePlane(const uint8_t* src,
   // Negative height means invert the image.
   if (src_height < 0) {
     src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (int64_t)src_stride;
     src_stride = -src_stride;
   }
   // Use specialized scales to improve performance for common resolutions.
@@ -2056,14 +2081,6 @@ int ScalePlane_16(const uint16_t* src,
                   int dst_width,
                   int dst_height,
                   enum FilterMode filtering) {
-  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
-  // This prevents FixedDiv signed integer overflows that can lead to division
-  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
   // Simplify filtering when possible.
   filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                 filtering);
@@ -2071,7 +2088,7 @@ int ScalePlane_16(const uint16_t* src,
   // Negative height means invert the image.
   if (src_height < 0) {
     src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (int64_t)src_stride;
     src_stride = -src_stride;
   }
   // Use specialized scales to improve performance for common resolutions.
@@ -2168,14 +2185,6 @@ int ScalePlane_12(const uint16_t* src,
                   int dst_width,
                   int dst_height,
                   enum FilterMode filtering) {
-  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
-  // This prevents FixedDiv signed integer overflows that can lead to division
-  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
   // Simplify filtering when possible.
   filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                 filtering);
@@ -2183,7 +2192,7 @@ int ScalePlane_12(const uint16_t* src,
   // Negative height means invert the image.
   if (src_height < 0) {
     src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (int64_t)src_stride;
     src_stride = -src_stride;
   }
 
@@ -2224,17 +2233,17 @@ int I420Scale(const uint8_t* src_y,
               int dst_width,
               int dst_height,
               enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
 
   r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
                  dst_stride_y, dst_width, dst_height, filtering);
@@ -2269,17 +2278,17 @@ int I420Scale_16(const uint16_t* src_y,
                  int dst_width,
                  int dst_height,
                  enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
 
   r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
                     dst_stride_y, dst_width, dst_height, filtering);
@@ -2314,17 +2323,17 @@ int I420Scale_12(const uint16_t* src_y,
                  int dst_width,
                  int dst_height,
                  enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
 
   r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
                     dst_stride_y, dst_width, dst_height, filtering);
@@ -2365,8 +2374,8 @@ int I444Scale(const uint8_t* src_y,
   int r;
 
   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
@@ -2406,8 +2415,8 @@ int I444Scale_16(const uint16_t* src_y,
   int r;
 
   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
@@ -2447,8 +2456,8 @@ int I444Scale_12(const uint16_t* src_y,
   int r;
 
   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
@@ -2488,15 +2497,15 @@ int I422Scale(const uint8_t* src_y,
               int dst_width,
               int dst_height,
               enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int r;
 
   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
 
   r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
                  dst_stride_y, dst_width, dst_height, filtering);
@@ -2531,15 +2540,15 @@ int I422Scale_16(const uint16_t* src_y,
                  int dst_width,
                  int dst_height,
                  enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int r;
 
   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
 
   r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
                     dst_stride_y, dst_width, dst_height, filtering);
@@ -2574,15 +2583,15 @@ int I422Scale_12(const uint16_t* src_y,
                  int dst_width,
                  int dst_height,
                  enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int r;
 
   if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
 
   r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
                     dst_stride_y, dst_width, dst_height, filtering);
@@ -2616,17 +2625,17 @@ int NV12Scale(const uint8_t* src_y,
               int dst_width,
               int dst_height,
               enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
 
   r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
                  dst_stride_y, dst_width, dst_height, filtering);
@@ -2655,8 +2664,8 @@ int NV24Scale(const uint8_t* src_y,
   int r;
 
   if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 4dc446d5e..506409c15 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -38,8 +38,8 @@ static void ScaleARGBDown2(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
-                           ptrdiff_t src_stride,
-                           ptrdiff_t dst_stride,
+                           int src_stride,
+                           int dst_stride,
                            const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int x,
@@ -48,7 +48,7 @@ static void ScaleARGBDown2(int src_width,
                            int dy,
                            enum FilterMode filtering) {
   int j;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
                             uint8_t* dst_argb, int dst_width) =
       filtering == kFilterNone
@@ -62,9 +62,9 @@ static void ScaleARGBDown2(int src_width,
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
   // Advance to odd row, even column.
   if (filtering == kFilterBilinear) {
-    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+    src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
   } else {
-    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+    src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4;
   }
 
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
@@ -152,8 +152,8 @@ static int ScaleARGBDown4Box(int src_width,
                              int src_height,
                              int dst_width,
                              int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                              const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              int x,
@@ -169,12 +169,12 @@ static int ScaleARGBDown4Box(int src_width,
   align_buffer_64(row, row_size * 2);
   if (!row)
     return 1;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
                             uint8_t* dst_argb, int dst_width) =
       ScaleARGBRowDown2Box_C;
   // Advance to odd row, even column.
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
   (void)src_width;
   (void)src_height;
   (void)dx;
@@ -226,8 +226,8 @@ static void ScaleARGBDownEven(int src_width,
                               int src_height,
                               int dst_width,
                               int dst_height,
-                              ptrdiff_t src_stride,
-                              ptrdiff_t dst_stride,
+                              int src_stride,
+                              int dst_stride,
                               const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int x,
@@ -237,7 +237,7 @@ static void ScaleARGBDownEven(int src_width,
                               enum FilterMode filtering) {
   int j;
   int col_step = dx >> 16;
-  ptrdiff_t row_stride = (dy >> 16) * src_stride;
+  ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
   void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
                                int src_step, uint8_t* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
@@ -245,7 +245,7 @@ static void ScaleARGBDownEven(int src_width,
   (void)src_height;
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
@@ -302,8 +302,8 @@ static int ScaleARGBBilinearDown(int src_width,
                                  int src_height,
                                  int dst_width,
                                  int dst_height,
-                                 ptrdiff_t src_stride,
-                                 ptrdiff_t dst_stride,
+                                 int src_stride,
+                                 int dst_stride,
                                  const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  int x,
@@ -331,6 +331,14 @@ static int ScaleARGBBilinearDown(int src_width,
   clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
   src_argb += xl * 4;
   x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -405,7 +413,7 @@ static int ScaleARGBBilinearDown(int src_width,
     }
     for (j = 0; j < dst_height; ++j) {
       int yi = y >> 16;
-      const uint8_t* src = src_argb + yi * src_stride;
+      const uint8_t* src = src_argb + yi * (intptr_t)src_stride;
       if (filtering == kFilterLinear) {
         ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
       } else {
@@ -429,8 +437,8 @@ static int ScaleARGBBilinearUp(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                                const uint8_t* src_argb,
                                uint8_t* dst_argb,
                                int x,
@@ -446,6 +454,14 @@ static int ScaleARGBBilinearUp(int src_width,
                               int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -546,7 +562,7 @@ static int ScaleARGBBilinearUp(int src_width,
 
   {
     int yi = y >> 16;
-    const uint8_t* src = src_argb + yi * src_stride;
+    const uint8_t* src = src_argb + yi * (intptr_t)src_stride;
 
     // Allocate 2 rows of ARGB.
     const int row_size = (dst_width * 4 + 31) & ~31;
@@ -555,7 +571,7 @@ static int ScaleARGBBilinearUp(int src_width,
       return 1;
 
     uint8_t* rowptr = row;
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
     int lasty = yi;
 
     ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
@@ -573,7 +589,7 @@ static int ScaleARGBBilinearUp(int src_width,
         if (y > max_y) {
           y = max_y;
           yi = y >> 16;
-          src = src_argb + yi * src_stride;
+          src = src_argb + yi * (intptr_t)src_stride;
         }
         if (yi != lasty) {
           ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
@@ -599,6 +615,283 @@ static int ScaleARGBBilinearUp(int src_width,
   return 0;
 }
 
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+static int ScaleYUVToARGBBilinearUp(int src_width,
+                                    int src_height,
+                                    int dst_width,
+                                    int dst_height,
+                                    int src_stride_y,
+                                    int src_stride_u,
+                                    int src_stride_v,
+                                    int dst_stride_argb,
+                                    const uint8_t* src_y,
+                                    const uint8_t* src_u,
+                                    const uint8_t* src_v,
+                                    uint8_t* dst_argb,
+                                    int x,
+                                    int dx,
+                                    int y,
+                                    int dy,
+                                    enum FilterMode filtering) {
+  int j;
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
+      I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+      (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(src_width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToARGBRow = I422ToARGBRow_SVE2;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    I422ToARGBRow = I422ToARGBRow_SME;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    I422ToARGBRow = I422ToARGBRow_Any_LSX;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    I422ToARGBRow = I422ToARGBRow_Any_LASX;
+    if (IS_ALIGNED(src_width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToARGBRow = I422ToARGBRow_RVV;
+  }
+#endif
+
+  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    InterpolateRow = InterpolateRow_Any_LSX;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
+
+  void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+  if (filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
+  if (filtering && TestCpuFlag(kCpuHasRVV)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_LSX)
+  if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_LSX;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  const int max_y = (src_height - 1) << 16;
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;
+  int uv_yi = yi >> kYShift;
+  const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y;
+  const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+  const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
+
+  // Allocate 1 row of ARGB for source conversion and 2 rows of ARGB
+  // scaled horizontally to the destination width.
+  const int row_size = (dst_width * 4 + 31) & ~31;
+  align_buffer_64(row, row_size * 2 + src_width * 4);
+
+  uint8_t* argb_row = row + row_size * 2;
+  uint8_t* rowptr = row;
+  int rowstride = row_size;
+  int lasty = yi;
+  if (!row)
+    return 1;
+
+  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * (intptr_t)src_stride_y;
+        src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+        src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  return 0;
+}
+#endif
+
 // Scale ARGB to/from any dimensions, without interpolation.
 // Fixed point math is used for performance: The upper 16 bits
 // of x and dx is the integer part of the source position and
@@ -608,8 +901,8 @@ static void ScaleARGBSimple(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                             const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int x,
@@ -652,8 +945,8 @@ static void ScaleARGBSimple(int src_width,
   }
 
   for (j = 0; j < dst_height; ++j) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
-                  dx);
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride,
+                  dst_width, x, dx);
     dst_argb += dst_stride;
     y += dy;
   }
@@ -688,7 +981,7 @@ static int ScaleARGB(const uint8_t* src,
   // Negative src_height means invert the image.
   if (src_height < 0) {
     src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (intptr_t)src_stride;
     src_stride = -src_stride;
   }
   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -703,8 +996,8 @@ static int ScaleARGB(const uint8_t* src,
   if (clip_y) {
     int64_t clipf = (int64_t)(clip_y)*dy;
     y += (clipf & 0xffff);
-    src += (clipf >> 16) * (ptrdiff_t)src_stride;
-    dst += clip_y * (ptrdiff_t)dst_stride;
+    src += (clipf >> 16) * (intptr_t)src_stride;
+    dst += clip_y * dst_stride;
   }
 
   // Special case for integer step values.
@@ -737,7 +1030,7 @@ static int ScaleARGB(const uint8_t* src,
         filtering = kFilterNone;
         if (dx == 0x10000 && dy == 0x10000) {
           // Straight copy.
-          ARGBCopy(src + (y >> 16) * (ptrdiff_t)src_stride + (x >> 16) * 4,
+          ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4,
                    src_stride, dst, dst_stride, clip_width, clip_height);
           return 0;
         }
@@ -779,9 +1072,9 @@ int ARGBScaleClip(const uint8_t* src_argb,
                   int clip_width,
                   int clip_height,
                   enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 || src_height == INT_MIN ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 || clip_x < 0 ||
-      clip_y < 0 || clip_width > 32768 || clip_height > 32768 ||
+  if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+      dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
+      clip_width > 32768 || clip_height > 32768 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
@@ -802,9 +1095,8 @@ int ARGBScale(const uint8_t* src_argb,
               int dst_width,
               int dst_height,
               enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 || src_height == INT_MIN ||
-      src_width > 32768 || src_height > 32768 || !dst_argb || dst_width <= 0 ||
-      dst_height <= 0) {
+  if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
   return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
@@ -836,13 +1128,12 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
   int r;
   (void)src_fourcc;  // TODO(fbarchard): implement and/or assert.
   (void)dst_fourcc;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
   if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 ||
-      src_width > INT_MAX / 4 || src_height == 0 || src_height == INT_MIN ||
-      dst_width <= 0 || dst_height <= 0 || clip_width <= 0 ||
-      clip_height <= 0) {
+      src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 ||
+      dst_height <= 0 || clip_width <= 0 || clip_height <= 0) {
     return -1;
   }
-  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
   const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4;
   if (argb_buffer_size > SIZE_MAX) {
     return -1;  // Invalid size.
diff --git a/source/scale_common.cc b/source/scale_common.cc
index e2447119b..dff17e3ea 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -792,10 +792,10 @@ void ScaleFilterCols64_C(uint8_t* dst_ptr,
 #undef BLENDER
 
 // Same as 8 bit arm blender but return is cast to uint16_t
-#define BLENDER(a, b, f)                                                      \
-  (uint16_t)((int)(a) +                                                       \
-             (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> \
-                   16))
+#define BLENDER(a, b, f) \
+  (uint16_t)(            \
+      (int)(a) +         \
+      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
 
 void ScaleFilterCols_16_C(uint16_t* dst_ptr,
                           const uint16_t* src_ptr,
@@ -1196,7 +1196,7 @@ void ScaleARGBColsUp2_C(uint8_t* dst_argb,
 
 // TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
 #define BLENDERC(a, b, f, s) \
   (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
 #define BLENDER(a, b, f)                                                 \
@@ -1636,6 +1636,14 @@ void ScalePlaneVertical(int src_height,
   assert(dst_width > 0);
   assert(dst_height > 0);
   src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -1710,6 +1718,14 @@ void ScalePlaneVertical_16(int src_height,
   assert(dst_width > 0);
   assert(dst_height > 0);
   src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_16_Any_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_16_Any_SSSE3;
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index 773076669..5338482c5 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -1759,25 +1759,25 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                       uint16_t* dst_ptr,
                       int src_width) {
-  asm volatile("pxor        %%xmm5,%%xmm5                 \n"
+      asm volatile("pxor        %%xmm5,%%xmm5                 \n"
 
                // 16 pixel loop.
                LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm3                   \n"
-               "lea         0x10(%0),%0                   \n"  // src_ptr += 16
-               "movdqu      (%1),%%xmm0                   \n"
-               "movdqu      0x10(%1),%%xmm1               \n"
-               "movdqa      %%xmm3,%%xmm2                 \n"
-               "punpcklbw   %%xmm5,%%xmm2                 \n"
-               "punpckhbw   %%xmm5,%%xmm3                 \n"
-               "paddusw     %%xmm2,%%xmm0                 \n"
-               "paddusw     %%xmm3,%%xmm1                 \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "movdqu      %%xmm1,0x10(%1)               \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm3                   \n"
+      "lea         0x10(%0),%0                   \n"  // src_ptr += 16
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      0x10(%1),%%xmm1               \n"
+      "movdqa      %%xmm3,%%xmm2                 \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "punpckhbw   %%xmm5,%%xmm3                 \n"
+      "paddusw     %%xmm2,%%xmm0                 \n"
+      "paddusw     %%xmm3,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
                : "+r"(src_ptr),   // %0
                  "+r"(dst_ptr),   // %1
                  "+r"(src_width)  // %2
@@ -1790,23 +1790,23 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                       uint16_t* dst_ptr,
                       int src_width) {
-  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+      asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
 
                LABELALIGN
-               "1:          \n"
-               "vmovdqu     (%0),%%ymm3                   \n"
-               "lea         0x20(%0),%0                   \n"  // src_ptr += 32
-               "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
-               "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
-               "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
-               "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
-               "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "vmovdqu     %%ymm1,0x20(%1)               \n"
-               "lea         0x40(%1),%1                   \n"
-               "sub         $0x20,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     (%0),%%ymm3                   \n"
+      "lea         0x20(%0),%0                   \n"  // src_ptr += 32
+      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
+      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
+      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
+      "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
                : "+r"(src_ptr),   // %0
                  "+r"(dst_ptr),   // %1
                  "+r"(src_width)  // %2
diff --git a/source/scale_rgb.cc b/source/scale_rgb.cc
index 6040b364e..5e69fe379 100644
--- a/source/scale_rgb.cc
+++ b/source/scale_rgb.cc
@@ -42,8 +42,8 @@ int RGBScale(const uint8_t* src_rgb,
              enum FilterMode filtering) {
   int r;
   if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
-      src_height == 0 || src_height == INT_MIN || dst_width <= 0 ||
-      dst_width > INT_MAX / 4 || dst_height <= 0) {
+      src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 ||
+      dst_height <= 0) {
     return -1;
   }
   const int abs_src_height = (src_height < 0) ? -src_height : src_height;
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
index 43a464732..3d41a2398 100644
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@@ -11,7 +11,6 @@
 #include "libyuv/scale_uv.h"
 
 #include <assert.h>
-#include <limits.h>
 #include <string.h>
 
 #include "libyuv/cpu_id.h"
@@ -60,8 +59,8 @@ static void ScaleUVDown2(int src_width,
                          int src_height,
                          int dst_width,
                          int dst_height,
-                         ptrdiff_t src_stride,
-                         ptrdiff_t dst_stride,
+                         int src_stride,
+                         int dst_stride,
                          const uint8_t* src_uv,
                          uint8_t* dst_uv,
                          int x,
@@ -70,7 +69,7 @@ static void ScaleUVDown2(int src_width,
                          int dy,
                          enum FilterMode filtering) {
   int j;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
   void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
                           uint8_t* dst_uv, int dst_width) =
       filtering == kFilterNone
@@ -84,9 +83,9 @@ static void ScaleUVDown2(int src_width,
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
   // Advance to odd row, even column.
   if (filtering == kFilterBilinear) {
-    src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+    src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
   } else {
-    src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
+    src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2;
   }
 
 #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
@@ -175,8 +174,8 @@ static int ScaleUVDown4Box(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
-                           ptrdiff_t src_stride,
-                           ptrdiff_t dst_stride,
+                           int src_stride,
+                           int dst_stride,
                            const uint8_t* src_uv,
                            uint8_t* dst_uv,
                            int x,
@@ -189,12 +188,12 @@ static int ScaleUVDown4Box(int src_width,
   align_buffer_64(row, row_size * 2);
   if (!row)
     return 1;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
   void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
                           uint8_t* dst_uv, int dst_width) =
       ScaleUVRowDown2Box_C;
   // Advance to odd row, even column.
-  src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+  src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
   (void)src_width;
   (void)src_height;
   (void)dx;
@@ -257,8 +256,8 @@ static void ScaleUVDownEven(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                             const uint8_t* src_uv,
                             uint8_t* dst_uv,
                             int x,
@@ -268,7 +267,7 @@ static void ScaleUVDownEven(int src_width,
                             enum FilterMode filtering) {
   int j;
   int col_step = dx >> 16;
-  ptrdiff_t row_stride = (dy >> 16) * src_stride;
+  ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
   void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
                              int src_step, uint8_t* dst_uv, int dst_width) =
       filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
@@ -276,7 +275,7 @@ static void ScaleUVDownEven(int src_width,
   (void)src_height;
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
-  src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+  src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
 #if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
@@ -335,8 +334,8 @@ static int ScaleUVBilinearDown(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                                const uint8_t* src_uv,
                                uint8_t* dst_uv,
                                int x,
@@ -364,6 +363,14 @@ static int ScaleUVBilinearDown(int src_width,
   clip_src_width = (int)(xr - xl) * 2;  // Width aligned to 2.
   src_uv += xl * 2;
   x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -423,7 +430,7 @@ static int ScaleUVBilinearDown(int src_width,
     }
     for (j = 0; j < dst_height; ++j) {
       int yi = y >> 16;
-      const uint8_t* src = src_uv + yi * src_stride;
+      const uint8_t* src = src_uv + yi * (intptr_t)src_stride;
       if (filtering == kFilterLinear) {
         ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
       } else {
@@ -449,8 +456,8 @@ static int ScaleUVBilinearUp(int src_width,
                              int src_height,
                              int dst_width,
                              int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                              const uint8_t* src_uv,
                              uint8_t* dst_uv,
                              int x,
@@ -466,6 +473,14 @@ static int ScaleUVBilinearUp(int src_width,
                             int dst_width, int x, int dx) =
       filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
   const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     InterpolateRow = InterpolateRow_Any_AVX2;
@@ -544,7 +559,7 @@ static int ScaleUVBilinearUp(int src_width,
 
   {
     int yi = y >> 16;
-    const uint8_t* src = src_uv + yi * src_stride;
+    const uint8_t* src = src_uv + yi * (intptr_t)src_stride;
 
     // Allocate 2 rows of UV.
     const int row_size = (dst_width * 2 + 15) & ~15;
@@ -553,7 +568,7 @@ static int ScaleUVBilinearUp(int src_width,
       return 1;
 
     uint8_t* rowptr = row;
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
     int lasty = yi;
 
     ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
@@ -571,7 +586,7 @@ static int ScaleUVBilinearUp(int src_width,
         if (y > max_y) {
           y = max_y;
           yi = y >> 16;
-          src = src_uv + yi * src_stride;
+          src = src_uv + yi * (intptr_t)src_stride;
         }
         if (yi != lasty) {
           ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
@@ -607,8 +622,8 @@ static void ScaleUVLinearUp2(int src_width,
                              int src_height,
                              int dst_width,
                              int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                              const uint8_t* src_uv,
                              uint8_t* dst_uv) {
   void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
@@ -646,12 +661,13 @@ static void ScaleUVLinearUp2(int src_width,
 #endif
 
   if (dst_height == 1) {
-    ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
+    ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+               dst_width);
   } else {
     dy = FixedDiv(src_height - 1, dst_height - 1);
     y = (1 << 15) - 1;
     for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
+      ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
       dst_uv += dst_stride;
       y += dy;
     }
@@ -727,8 +743,8 @@ static void ScaleUVLinearUp2_16(int src_width,
                                 int src_height,
                                 int dst_width,
                                 int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                 const uint16_t* src_uv,
                                 uint16_t* dst_uv) {
   void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
@@ -760,12 +776,13 @@ static void ScaleUVLinearUp2_16(int src_width,
 #endif
 
   if (dst_height == 1) {
-    ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
+    ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+               dst_width);
   } else {
     dy = FixedDiv(src_height - 1, dst_height - 1);
     y = (1 << 15) - 1;
     for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
+      ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
       dst_uv += dst_stride;
       y += dy;
     }
@@ -835,8 +852,8 @@ static void ScaleUVSimple(int src_width,
                           int src_height,
                           int dst_width,
                           int dst_height,
-                          ptrdiff_t src_stride,
-                          ptrdiff_t dst_stride,
+                          int src_stride,
+                          int dst_stride,
                           const uint8_t* src_uv,
                           uint8_t* dst_uv,
                           int x,
@@ -871,7 +888,8 @@ static void ScaleUVSimple(int src_width,
   }
 
   for (j = 0; j < dst_height; ++j) {
-    ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx);
+    ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x,
+                dx);
     dst_uv += dst_stride;
     y += dy;
   }
@@ -885,13 +903,13 @@ static int UVCopy(const uint8_t* src_uv,
                   int dst_stride_uv,
                   int width,
                   int height) {
-  if (!src_uv || !dst_uv || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uv = src_uv + (height - 1) * (ptrdiff_t)src_stride_uv;
+    src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
     src_stride_uv = -src_stride_uv;
   }
 
@@ -905,13 +923,13 @@ static int UVCopy_16(const uint16_t* src_uv,
                      int dst_stride_uv,
                      int width,
                      int height) {
-  if (!src_uv || !dst_uv || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_uv || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_uv = src_uv + (height - 1) * (ptrdiff_t)src_stride_uv;
+    src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
     src_stride_uv = -src_stride_uv;
   }
 
@@ -949,7 +967,7 @@ static int ScaleUV(const uint8_t* src,
   // Negative src_height means invert the image.
   if (src_height < 0) {
     src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (intptr_t)src_stride;
     src_stride = -src_stride;
   }
   ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -964,8 +982,8 @@ static int ScaleUV(const uint8_t* src,
   if (clip_y) {
     int64_t clipf = (int64_t)(clip_y)*dy;
     y += (clipf & 0xffff);
-    src += (clipf >> 16) * (ptrdiff_t)src_stride;
-    dst += clip_y * (ptrdiff_t)dst_stride;
+    src += (clipf >> 16) * (intptr_t)src_stride;
+    dst += clip_y * dst_stride;
   }
 
   // Special case for integer step values.
@@ -1005,8 +1023,9 @@ static int ScaleUV(const uint8_t* src,
 #ifdef HAS_UVCOPY
         if (dx == 0x10000 && dy == 0x10000) {
           // Straight copy.
-          return UVCopy(src + (y >> 16) * (ptrdiff_t)src_stride + (x >> 16) * 2,
-                        src_stride, dst, dst_stride, clip_width, clip_height);
+          UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2,
+                 src_stride, dst, dst_stride, clip_width, clip_height);
+          return 0;
         }
 #endif
       }
@@ -1062,8 +1081,7 @@ int UVScale(const uint8_t* src_uv,
             int dst_height,
             enum FilterMode filtering) {
   if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
   return ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv,
@@ -1085,9 +1103,8 @@ int UVScale_16(const uint16_t* src_uv,
                enum FilterMode filtering) {
   int dy = 0;
 
-  if (!src_uv || src_width <= 0 || src_height == 0 || src_height == INT_MIN ||
-      src_width > 32768 || src_height > 32768 || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
+  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
@@ -1099,7 +1116,7 @@ int UVScale_16(const uint16_t* src_uv,
   // Negative src_height means invert the image.
   if (src_height < 0) {
     src_height = -src_height;
-    src_uv = src_uv + (src_height - 1) * (ptrdiff_t)src_stride_uv;
+    src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv;
     src_stride_uv = -src_stride_uv;
   }
   src_width = Abs(src_width);
@@ -1107,17 +1124,16 @@ int UVScale_16(const uint16_t* src_uv,
 #ifdef HAS_UVCOPY
   if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
     if (dst_height == 1) {
-      return UVCopy_16(
-          src_uv + ((src_height - 1) / 2) * (ptrdiff_t)src_stride_uv,
-          src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height);
+      UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv,
+                src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height);
+    } else {
+      dy = src_height / dst_height;
+      UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv,
+                (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv,
+                dst_width, dst_height);
     }
-    dy = src_height / dst_height;
-    if (src_stride_uv > INT_MAX / dy) {
-      return -1;
-    }
-    return UVCopy_16(src_uv + ((dy - 1) / 2) * (ptrdiff_t)src_stride_uv,
-                     dy * src_stride_uv, dst_uv, dst_stride_uv, dst_width,
-                     dst_height);
+
+    return 0;
   }
 #endif
 
diff --git a/source/scale_win.cc b/source/scale_win.cc
index 4b7fd3590..870ed77b3 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -104,7 +104,7 @@ __declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8       // isolate odd pixels.
+    psrlw      xmm0, 8          // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -138,7 +138,7 @@ __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
     lea        eax,  [eax + 32]
     pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
-    pavgw      xmm0, xmm5    // (x + 1) / 2
+    pavgw      xmm0, xmm5       // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -213,7 +213,7 @@ __declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
     vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
     vpsrlw      ymm1, ymm1, 8
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -249,7 +249,7 @@ __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
     vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -319,7 +319,7 @@ __declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
     // src_stride ignored
     mov        edx, [esp + 12]  // dst_ptr
     mov        ecx, [esp + 16]  // dst_width
-    pcmpeqb    xmm5, xmm5     // generate mask 0x00ff0000
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
     psrld      xmm5, 24
     pslld      xmm5, 16
 
@@ -424,7 +424,7 @@ __declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -687,7 +687,7 @@ __declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
     pshufb     xmm1, xmm5
     paddusb    xmm0, xmm1
 
-    movq       qword ptr [edx], xmm0    // write 12 pixels
+    movq       qword ptr [edx], xmm0       // write 12 pixels
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
@@ -1030,7 +1030,7 @@ __declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     shufps     xmm0, xmm1, 0x88  // even pixels
-    shufps     xmm2, xmm1, 0xdd    // odd pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1216,7 +1216,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
     test       ecx, 2
     je         xloop29
 
-         // 2 Pixels.
+        // 2 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
     pextrw     eax, xmm2, 5  // get x2 integer.
@@ -1229,7 +1229,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
     test       ecx, 1
     je         xloop99
 
-         // 1 Pixels.
+        // 1 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
     movd       dword ptr [edi], xmm0
  xloop99:
diff --git a/unit_test/basictypes_test.cc b/unit_test/basictypes_test.cc
index 421b56f85..9aaa2dcd9 100644
--- a/unit_test/basictypes_test.cc
+++ b/unit_test/basictypes_test.cc
@@ -22,22 +22,22 @@ TEST_F(LibYUVBaseTest, SizeOfTypes) {
   uint32_t u32 = 1u;
   int64_t i64 = -1;
   uint64_t u64 = 1u;
-  ASSERT_EQ(1u, sizeof(i8));
-  ASSERT_EQ(1u, sizeof(u8));
-  ASSERT_EQ(2u, sizeof(i16));
-  ASSERT_EQ(2u, sizeof(u16));
-  ASSERT_EQ(4u, sizeof(i32));
-  ASSERT_EQ(4u, sizeof(u32));
-  ASSERT_EQ(8u, sizeof(i64));
-  ASSERT_EQ(8u, sizeof(u64));
-  ASSERT_GT(0, i8);
-  ASSERT_LT(0u, u8);
-  ASSERT_GT(0, i16);
-  ASSERT_LT(0u, u16);
-  ASSERT_GT(0, i32);
-  ASSERT_LT(0u, u32);
-  ASSERT_GT(0, i64);
-  ASSERT_LT(0u, u64);
+  EXPECT_EQ(1u, sizeof(i8));
+  EXPECT_EQ(1u, sizeof(u8));
+  EXPECT_EQ(2u, sizeof(i16));
+  EXPECT_EQ(2u, sizeof(u16));
+  EXPECT_EQ(4u, sizeof(i32));
+  EXPECT_EQ(4u, sizeof(u32));
+  EXPECT_EQ(8u, sizeof(i64));
+  EXPECT_EQ(8u, sizeof(u64));
+  EXPECT_GT(0, i8);
+  EXPECT_LT(0u, u8);
+  EXPECT_GT(0, i16);
+  EXPECT_LT(0u, u16);
+  EXPECT_GT(0, i32);
+  EXPECT_LT(0u, u32);
+  EXPECT_GT(0, i64);
+  EXPECT_LT(0u, u64);
 }
 
 }  // namespace libyuv
diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc
index 24456a524..01267ff1e 100644
--- a/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@@ -22,8 +22,14 @@ namespace libyuv {
 
 // TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB.
 // Port to Visual C and other CPUs
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__))
+#define ERROR_FULL 5
+#define ERROR_J420 4
+#else
 #define ERROR_FULL 6
 #define ERROR_J420 6
+#endif
 #define ERROR_R 1
 #define ERROR_G 1
 #ifdef LIBYUV_UNLIMITED_DATA
@@ -113,11 +119,11 @@ namespace libyuv {
     }                                                                          \
     /* Test C and SIMD match. */                                               \
     for (int i = 0; i < kPixels * 4; ++i) {                                    \
-      ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                           \
+      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                           \
     }                                                                          \
     /* Test SIMD is close to original. */                                      \
     for (int i = 0; i < kPixels * 4; ++i) {                                    \
-      ASSERT_NEAR(static_cast<int>(orig_pixels[i]),                            \
+      EXPECT_NEAR(static_cast<int>(orig_pixels[i]),                            \
                   static_cast<int>(dst_pixels_opt[i]), DIFF);                  \
     }                                                                          \
                                                                                \
@@ -425,16 +431,15 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
       allb |= b;
     }
   }
-  ASSERT_GE(allb, 0);
-  ASSERT_LE(allb, 255);
+  EXPECT_GE(allb, 0);
+  EXPECT_LE(allb, 255);
 }
 
 // BT.601 limited range YUV to RGB reference
 static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
-  double y1 = (y - 16) * 1.164;
-  *r = RoundToByte(y1 - (v - 128) * -1.596);
-  *g = RoundToByte(y1 - (u - 128) * 0.391 - (v - 128) * 0.813);
-  *b = RoundToByte(y1 - (u - 128) * -2.018);
+  *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
+  *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
+  *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
 }
 
 // BT.601 full range YUV to RGB reference (aka JPEG)
@@ -447,10 +452,9 @@ static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
 // BT.709 limited range YUV to RGB reference
 // See also http://www.equasys.de/colorconversion.html
 static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
-  double y1 = (y - 16) * 1.164;
-  *r = RoundToByte(y1 - (v - 128) * -1.793);
-  *g = RoundToByte(y1 - (u - 128) * 0.213 - (v - 128) * 0.533);
-  *b = RoundToByte(y1 - (u - 128) * -2.112);
+  *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
+  *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.213 - (v - 128) * 0.533);
+  *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.112);
 }
 
 // BT.709 full range YUV to RGB reference
@@ -462,10 +466,10 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
 
 // BT.2020 limited range YUV to RGB reference
 static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
-  double y1 = (y - 16) * 1.164384;
-  *r = RoundToByte(y1 - (v - 128) * -1.67867);
-  *g = RoundToByte(y1 - (u - 128) * 0.187326 - (v - 128) * 0.65042);
-  *b = RoundToByte(y1 - (u - 128) * -2.14177);
+  *r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
+  *g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
+                   (v - 128) * 0.65042);
+  *b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
 }
 
 // BT.2020 full range YUV to RGB reference
@@ -480,48 +484,48 @@ TEST_F(LibYUVColorTest, TestYUV) {
 
   // cyan (less red)
   YUVToRGBReference(240, 255, 0, &r0, &g0, &b0);
-  ASSERT_EQ(56, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(255, b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
 
   YUVToRGB(240, 255, 0, &r1, &g1, &b1);
-  ASSERT_EQ(57, r1);
-  ASSERT_EQ(255, g1);
-  ASSERT_EQ(255, b1);
+  EXPECT_EQ(57, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
 
   // green (less red and blue)
   YUVToRGBReference(240, 0, 0, &r0, &g0, &b0);
-  ASSERT_EQ(56, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(2, b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(2, b0);
 
   YUVToRGB(240, 0, 0, &r1, &g1, &b1);
-  ASSERT_EQ(57, r1);
-  ASSERT_EQ(255, g1);
+  EXPECT_EQ(57, r1);
+  EXPECT_EQ(255, g1);
 #ifdef LIBYUV_UNLIMITED_DATA
-  ASSERT_EQ(3, b1);
+  EXPECT_EQ(3, b1);
 #else
-  ASSERT_EQ(5, b1);
+  EXPECT_EQ(5, b1);
 #endif
 
   for (int i = 0; i < 256; ++i) {
     YUVToRGBReference(i, 128, 128, &r0, &g0, &b0);
     YUVToRGB(i, 128, 128, &r1, &g1, &b1);
-    ASSERT_NEAR(r0, r1, ERROR_R);
-    ASSERT_NEAR(g0, g1, ERROR_G);
-    ASSERT_NEAR(b0, b1, ERROR_B);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
 
     YUVToRGBReference(i, 0, 0, &r0, &g0, &b0);
     YUVToRGB(i, 0, 0, &r1, &g1, &b1);
-    ASSERT_NEAR(r0, r1, ERROR_R);
-    ASSERT_NEAR(g0, g1, ERROR_G);
-    ASSERT_NEAR(b0, b1, ERROR_B);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
 
     YUVToRGBReference(i, 0, 255, &r0, &g0, &b0);
     YUVToRGB(i, 0, 255, &r1, &g1, &b1);
-    ASSERT_NEAR(r0, r1, ERROR_R);
-    ASSERT_NEAR(g0, g1, ERROR_G);
-    ASSERT_NEAR(b0, b1, ERROR_B);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
   }
 }
 
@@ -530,47 +534,47 @@ TEST_F(LibYUVColorTest, TestGreyYUV) {
 
   // black
   YUVToRGBReference(16, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(0, r0);
-  ASSERT_EQ(0, g0);
-  ASSERT_EQ(0, b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);
 
   YUVToRGB(16, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(0, r1);
-  ASSERT_EQ(0, g1);
-  ASSERT_EQ(0, b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);
 
   // white
   YUVToRGBReference(240, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(255, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(255, b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
 
   YUVToRGB(240, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(255, r1);
-  ASSERT_EQ(255, g1);
-  ASSERT_EQ(255, b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
 
   // grey
   YUVToRGBReference(128, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(130, r0);
-  ASSERT_EQ(130, g0);
-  ASSERT_EQ(130, b0);
+  EXPECT_EQ(130, r0);
+  EXPECT_EQ(130, g0);
+  EXPECT_EQ(130, b0);
 
   YUVToRGB(128, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(130, r1);
-  ASSERT_EQ(130, g1);
-  ASSERT_EQ(130, b1);
+  EXPECT_EQ(130, r1);
+  EXPECT_EQ(130, g1);
+  EXPECT_EQ(130, b1);
 
   for (int y = 0; y < 256; ++y) {
     YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
     YUVToRGB(y, 128, 128, &r1, &g1, &b1);
     YToRGB(y, &r2, &g2, &b2);
-    ASSERT_EQ(r0, r1);
-    ASSERT_EQ(g0, g1);
-    ASSERT_EQ(b0, b1);
-    ASSERT_EQ(r0, r2);
-    ASSERT_EQ(g0, g2);
-    ASSERT_EQ(b0, b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
   }
 }
 
@@ -608,11 +612,10 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
 #ifdef DISABLE_SLOW_TESTS
 #define FASTSTEP 5
 #else
-#define FASTSTEP 3
+#define FASTSTEP 1
 #endif
 
 // BT.601 limited range.
-#ifndef DISABLE_SLOW_TESTS
 TEST_F(LibYUVColorTest, TestFullYUV) {
   int rh[256] = {
       0,
@@ -623,16 +626,16 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
   int bh[256] = {
       0,
   };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
         int r0, g0, b0, r1, g1, b1;
         int y = RANDOM256(y2);
         YUVToRGBReference(y, u, v, &r0, &g0, &b0);
         YUVToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
         ++rh[r1 - r0 + 128];
         ++gh[g1 - g0 + 128];
         ++bh[b1 - b0 + 128];
@@ -653,16 +656,16 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
   int bh[256] = {
       0,
   };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
         int r0, g0, b0, r1, g1, b1;
         int y = RANDOM256(y2);
         YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
         YUVJToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
         ++rh[r1 - r0 + 128];
         ++gh[g1 - g0 + 128];
         ++bh[b1 - b0 + 128];
@@ -683,16 +686,16 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
   int bh[256] = {
       0,
   };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
         int r0, g0, b0, r1, g1, b1;
         int y = RANDOM256(y2);
         YUVHToRGBReference(y, u, v, &r0, &g0, &b0);
         YUVHToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
         ++rh[r1 - r0 + 128];
         ++gh[g1 - g0 + 128];
         ++bh[b1 - b0 + 128];
@@ -713,16 +716,16 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
   int bh[256] = {
       0,
   };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
         int r0, g0, b0, r1, g1, b1;
         int y = RANDOM256(y2);
         YUVFToRGBReference(y, u, v, &r0, &g0, &b0);
         YUVFToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
         ++rh[r1 - r0 + 128];
         ++gh[g1 - g0 + 128];
         ++bh[b1 - b0 + 128];
@@ -743,16 +746,16 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
   int bh[256] = {
       0,
   };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
         int r0, g0, b0, r1, g1, b1;
         int y = RANDOM256(y2);
         YUVUToRGBReference(y, u, v, &r0, &g0, &b0);
         YUVUToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
         ++rh[r1 - r0 + 128];
         ++gh[g1 - g0 + 128];
         ++bh[b1 - b0 + 128];
@@ -773,16 +776,16 @@ TEST_F(LibYUVColorTest, TestFullYUVV) {
   int bh[256] = {
       0,
   };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
         int r0, g0, b0, r1, g1, b1;
         int y = RANDOM256(y2);
         YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
         YUVVToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, 2);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, 2);
+        EXPECT_NEAR(b0, b1, ERROR_B);
         ++rh[r1 - r0 + 128];
         ++gh[g1 - g0 + 128];
         ++bh[b1 - b0 + 128];
@@ -791,8 +794,6 @@ TEST_F(LibYUVColorTest, TestFullYUVV) {
   }
   PrintHistogram(rh, gh, bh);
 }
-#endif  // DISABLE_SLOW_TESTS
-
 #undef FASTSTEP
 
 TEST_F(LibYUVColorTest, TestGreyYUVJ) {
@@ -800,47 +801,47 @@ TEST_F(LibYUVColorTest, TestGreyYUVJ) {
 
   // black
   YUVJToRGBReference(0, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(0, r0);
-  ASSERT_EQ(0, g0);
-  ASSERT_EQ(0, b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);
 
   YUVJToRGB(0, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(0, r1);
-  ASSERT_EQ(0, g1);
-  ASSERT_EQ(0, b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);
 
   // white
   YUVJToRGBReference(255, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(255, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(255, b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);
 
   YUVJToRGB(255, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(255, r1);
-  ASSERT_EQ(255, g1);
-  ASSERT_EQ(255, b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);
 
   // grey
   YUVJToRGBReference(128, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(128, r0);
-  ASSERT_EQ(128, g0);
-  ASSERT_EQ(128, b0);
+  EXPECT_EQ(128, r0);
+  EXPECT_EQ(128, g0);
+  EXPECT_EQ(128, b0);
 
   YUVJToRGB(128, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(128, r1);
-  ASSERT_EQ(128, g1);
-  ASSERT_EQ(128, b1);
+  EXPECT_EQ(128, r1);
+  EXPECT_EQ(128, g1);
+  EXPECT_EQ(128, b1);
 
   for (int y = 0; y < 256; ++y) {
     YUVJToRGBReference(y, 128, 128, &r0, &g0, &b0);
     YUVJToRGB(y, 128, 128, &r1, &g1, &b1);
     YJToRGB(y, &r2, &g2, &b2);
-    ASSERT_EQ(r0, r1);
-    ASSERT_EQ(g0, g1);
-    ASSERT_EQ(b0, b1);
-    ASSERT_EQ(r0, r2);
-    ASSERT_EQ(g0, g2);
-    ASSERT_EQ(b0, b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
   }
 }
 
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 8ae17f545..c29562cb8 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -48,7 +48,7 @@ TEST_F(LibYUVCompareTest, Djb2_Test) {
       " together with Hermann Zapf";
   uint32_t foxhash = HashDjb2(reinterpret_cast<const uint8_t*>(fox), 131, 5381);
   const uint32_t kExpectedFoxHash = 2611006483u;
-  ASSERT_EQ(kExpectedFoxHash, foxhash);
+  EXPECT_EQ(kExpectedFoxHash, foxhash);
 
   for (int i = 0; i < kMaxTest; ++i) {
     src_a[i] = (fastrand() & 0xff);
@@ -57,13 +57,13 @@ TEST_F(LibYUVCompareTest, Djb2_Test) {
   // Compare different buffers. Expect hash is different.
   uint32_t h1 = HashDjb2(src_a, kMaxTest, 5381);
   uint32_t h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make last half same. Expect hash is different.
   memcpy(src_a + kMaxTest / 2, src_b + kMaxTest / 2, kMaxTest / 2);
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make first half same. Expect hash is different.
   memcpy(src_a + kMaxTest / 2, src_a, kMaxTest / 2);
@@ -71,52 +71,52 @@ TEST_F(LibYUVCompareTest, Djb2_Test) {
   memcpy(src_a, src_b, kMaxTest / 2);
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make same. Expect hash is same.
   memcpy(src_a, src_b, kMaxTest);
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);
 
   // Mask seed different. Expect hash is different.
   memcpy(src_a, src_b, kMaxTest);
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_b, kMaxTest, 1234);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make one byte different in middle. Expect hash is different.
   memcpy(src_a, src_b, kMaxTest);
   ++src_b[kMaxTest / 2];
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make first byte different. Expect hash is different.
   memcpy(src_a, src_b, kMaxTest);
   ++src_b[0];
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make last byte different. Expect hash is different.
   memcpy(src_a, src_b, kMaxTest);
   ++src_b[kMaxTest - 1];
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make a zeros. Test different lengths. Expect hash is different.
   memset(src_a, 0, kMaxTest);
   h1 = HashDjb2(src_a, kMaxTest, 5381);
   h2 = HashDjb2(src_a, kMaxTest / 2, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);
 
   // Make a zeros and seed of zero. Test different lengths. Expect hash is same.
   memset(src_a, 0, kMaxTest);
   h1 = HashDjb2(src_a, kMaxTest, 0);
   h2 = HashDjb2(src_a, kMaxTest / 2, 0);
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -134,7 +134,7 @@ TEST_F(LibYUVCompareTest, BenchmarkDjb2_Opt) {
   for (int i = 0; i < benchmark_iterations_; ++i) {
     h1 = HashDjb2(src_a, kMaxTest, 5381);
   }
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);
   free_aligned_buffer_page_end(src_a);
 }
 
@@ -149,7 +149,7 @@ TEST_F(LibYUVCompareTest, BenchmarkDjb2_Unaligned) {
   for (int i = 0; i < benchmark_iterations_; ++i) {
     h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
   }
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);
   free_aligned_buffer_page_end(src_a);
 }
 
@@ -164,19 +164,19 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Opt) {
   src_a[0] = 0;
   fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
                       benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
   src_a[0] = 255;
   src_a[3] = 0;
   fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
                       benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
   src_a[3] = 255;
 
   for (int i = 0; i < benchmark_iterations_; ++i) {
     fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
                         benchmark_height_);
   }
-  ASSERT_EQ(0u, fourcc);
+  EXPECT_EQ(0u, fourcc);
 
   free_aligned_buffer_page_end(src_a);
 }
@@ -192,19 +192,19 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
   src_a[0 + 1] = 0;
   fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
                       benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
   src_a[0 + 1] = 255;
   src_a[3 + 1] = 0;
   fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
                       benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
   src_a[3 + 1] = 255;
 
   for (int i = 0; i < benchmark_iterations_; ++i) {
     fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
                         benchmark_height_);
   }
-  ASSERT_EQ(0u, fourcc);
+  EXPECT_EQ(0u, fourcc);
 
   free_aligned_buffer_page_end(src_a);
 }
@@ -221,7 +221,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
   memcpy(src_a, "test0123test4567", 16);
   memcpy(src_b, "tick0123tock4567", 16);
   uint32_t h1 = HammingDistance_C(src_a, src_b, 16);
-  ASSERT_EQ(16u, h1);
+  EXPECT_EQ(16u, h1);
 
   // Test C vs OPT on random buffer
   MemRandomize(src_a, kMaxWidth);
@@ -263,7 +263,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
     h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
 #endif
   }
-  ASSERT_EQ(h0, h1);
+  EXPECT_EQ(h0, h1);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -280,7 +280,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_C) {
   memcpy(src_a, "test0123test4567", 16);
   memcpy(src_b, "tick0123tock4567", 16);
   uint32_t h1 = HammingDistance_C(src_a, src_b, 16);
-  ASSERT_EQ(16u, h1);
+  EXPECT_EQ(16u, h1);
 
   // Test C vs OPT on random buffer
   MemRandomize(src_a, kMaxWidth);
@@ -295,7 +295,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_C) {
     h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
   }
 
-  ASSERT_EQ(h0, h1);
+  EXPECT_EQ(h0, h1);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -311,7 +311,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) {
   memcpy(src_a, "test0123test4567", 16);
   memcpy(src_b, "tick0123tock4567", 16);
   uint64_t h1 = ComputeHammingDistance(src_a, src_b, 16);
-  ASSERT_EQ(16u, h1);
+  EXPECT_EQ(16u, h1);
 
   // Test C vs OPT on random buffer
   MemRandomize(src_a, kMaxWidth);
@@ -326,7 +326,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) {
     h1 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
   }
 
-  ASSERT_EQ(h0, h1);
+  EXPECT_EQ(h0, h1);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -351,7 +351,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
   memset(src_b, 0u, kMaxWidth);
 
   uint64_t h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
-  ASSERT_EQ(kMaxWidth * 8ULL, h0);
+  EXPECT_EQ(kMaxWidth * 8ULL, h0);
 
   for (int i = 0; i < benchmark_iterations_; ++i) {
 #if defined(HAS_HAMMINGDISTANCE_NEON)
@@ -389,7 +389,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
   // result can not be expected to be correct.
   // TODO(fbarchard): Consider expecting the low 16 bits to match.
   if (kMaxWidth <= kMaxOptCount) {
-    ASSERT_EQ(kMaxWidth * 8U, h1);
+    EXPECT_EQ(kMaxWidth * 8U, h1);
   } else {
     if (kMaxWidth * 8ULL != static_cast<uint64_t>(h1)) {
       printf(
@@ -420,7 +420,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance) {
     h1 = ComputeHammingDistance(src_a, src_b,
                                 benchmark_width_ * benchmark_height_);
   }
-  ASSERT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h1);
+  EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h1);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -436,7 +436,7 @@ TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) {
   memcpy(src_a, "test0123test4567", 16);
   memcpy(src_b, "tick0123tock4567", 16);
   uint64_t h1 = ComputeSumSquareError(src_a, src_b, 16);
-  ASSERT_EQ(790u, h1);
+  EXPECT_EQ(790u, h1);
 
   for (int i = 0; i < kMaxWidth; ++i) {
     src_a[i] = i;
@@ -452,7 +452,7 @@ TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) {
     h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
   }
 
-  ASSERT_EQ(0u, h1);
+  EXPECT_EQ(0u, h1);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -468,18 +468,18 @@ TEST_F(LibYUVCompareTest, SumSquareError) {
   uint64_t err;
   err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
-  ASSERT_EQ(0u, err);
+  EXPECT_EQ(0u, err);
 
   memset(src_a, 1, kMaxWidth);
   err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
-  ASSERT_EQ(static_cast<int>(err), kMaxWidth);
+  EXPECT_EQ(static_cast<int>(err), kMaxWidth);
 
   memset(src_a, 190, kMaxWidth);
   memset(src_b, 193, kMaxWidth);
   err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
-  ASSERT_EQ(static_cast<int>(err), kMaxWidth * 3 * 3);
+  EXPECT_EQ(static_cast<int>(err), kMaxWidth * 3 * 3);
 
   for (int i = 0; i < kMaxWidth; ++i) {
     src_a[i] = (fastrand() & 0xff);
@@ -492,7 +492,7 @@ TEST_F(LibYUVCompareTest, SumSquareError) {
   MaskCpuFlags(benchmark_cpu_info_);
   uint64_t opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
 
-  ASSERT_EQ(c_err, opt_err);
+  EXPECT_EQ(c_err, opt_err);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -517,7 +517,7 @@ TEST_F(LibYUVCompareTest, BenchmarkPsnr_Opt) {
   opt_time = (get_time() - opt_time) / benchmark_iterations_;
   printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
 
-  ASSERT_EQ(0, 0);
+  EXPECT_EQ(0, 0);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -542,7 +542,7 @@ TEST_F(LibYUVCompareTest, BenchmarkPsnr_Unaligned) {
   opt_time = (get_time() - opt_time) / benchmark_iterations_;
   printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
 
-  ASSERT_EQ(0, 0);
+  EXPECT_EQ(0, 0);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -564,7 +564,7 @@ TEST_F(LibYUVCompareTest, Psnr) {
                       src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                       kSrcHeight);
 
-  ASSERT_EQ(err, kMaxPsnr);
+  EXPECT_EQ(err, kMaxPsnr);
 
   memset(src_a, 255, kSrcPlaneSize);
 
@@ -572,7 +572,7 @@ TEST_F(LibYUVCompareTest, Psnr) {
                       src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                       kSrcHeight);
 
-  ASSERT_EQ(err, 0.0);
+  EXPECT_EQ(err, 0.0);
 
   memset(src_a, 1, kSrcPlaneSize);
 
@@ -580,8 +580,8 @@ TEST_F(LibYUVCompareTest, Psnr) {
                       src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                       kSrcHeight);
 
-  ASSERT_GT(err, 48.0);
-  ASSERT_LT(err, 49.0);
+  EXPECT_GT(err, 48.0);
+  EXPECT_LT(err, 49.0);
 
   for (int i = 0; i < kSrcPlaneSize; ++i) {
     src_a[i] = i;
@@ -591,9 +591,9 @@ TEST_F(LibYUVCompareTest, Psnr) {
                       src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                       kSrcHeight);
 
-  ASSERT_GT(err, 2.0);
+  EXPECT_GT(err, 2.0);
   if (kSrcWidth * kSrcHeight >= 256) {
-    ASSERT_LT(err, 6.0);
+    EXPECT_LT(err, 6.0);
   }
 
   memset(src_a, 0, kSrcPlaneSize);
@@ -619,7 +619,7 @@ TEST_F(LibYUVCompareTest, Psnr) {
                           src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                           kSrcHeight);
 
-  ASSERT_EQ(opt_err, c_err);
+  EXPECT_EQ(opt_err, c_err);
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -644,7 +644,7 @@ TEST_F(LibYUVCompareTest, DISABLED_BenchmarkSsim_Opt) {
   opt_time = (get_time() - opt_time) / benchmark_iterations_;
   printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6);
 
-  ASSERT_EQ(0, 0);  // Pass if we get this far.
+  EXPECT_EQ(0, 0);  // Pass if we get this far.
 
   free_aligned_buffer_page_end(src_a);
   free_aligned_buffer_page_end(src_b);
@@ -671,7 +671,7 @@ TEST_F(LibYUVCompareTest, Ssim) {
                       kSrcHeight);
 
   if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_EQ(err, 1.0);
+    EXPECT_EQ(err, 1.0);
   }
 
   memset(src_a, 255, kSrcPlaneSize);
@@ -681,7 +681,7 @@ TEST_F(LibYUVCompareTest, Ssim) {
                       kSrcHeight);
 
   if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_LT(err, 0.0001);
+    EXPECT_LT(err, 0.0001);
   }
 
   memset(src_a, 1, kSrcPlaneSize);
@@ -691,8 +691,8 @@ TEST_F(LibYUVCompareTest, Ssim) {
                       kSrcHeight);
 
   if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_GT(err, 0.0001);
-    ASSERT_LT(err, 0.9);
+    EXPECT_GT(err, 0.0001);
+    EXPECT_LT(err, 0.9);
   }
 
   for (int i = 0; i < kSrcPlaneSize; ++i) {
@@ -704,8 +704,8 @@ TEST_F(LibYUVCompareTest, Ssim) {
                       kSrcHeight);
 
   if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_GT(err, 0.0);
-    ASSERT_LT(err, 0.01);
+    EXPECT_GT(err, 0.0);
+    EXPECT_LT(err, 0.01);
   }
 
   for (int i = b; i < (kSrcHeight + b); ++i) {
@@ -729,7 +729,7 @@ TEST_F(LibYUVCompareTest, Ssim) {
                           kSrcHeight);
 
   if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_EQ(opt_err, c_err);
+    EXPECT_EQ(opt_err, c_err);
   }
 
   free_aligned_buffer_page_end(src_a);
diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc
index 7f545a435..e309b38bb 100644
--- a/unit_test/convert_argb_test.cc
+++ b/unit_test/convert_argb_test.cc
@@ -53,9 +53,9 @@ namespace libyuv {
 #define ABGRToABGR ARGBCopy
 
 // subsample amount uses a divide.
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
 
-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
 
 #define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X,              \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X,   \
@@ -82,19 +82,15 @@ namespace libyuv {
         (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1);                    \
     const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X);    \
     const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y);  \
-    align_buffer_page_end(src_y,                                               \
-                          kPaddedWidth * kPaddedHeight * SRC_BPC + OFF);       \
+    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF);  \
     align_buffer_page_end(                                                     \
-        src_uv,                                                                \
-        kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC * 2 + OFF);       \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);                \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);              \
-    align_buffer_page_end(dst_u_opt,                                           \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
-    align_buffer_page_end(dst_v_opt,                                           \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
+        src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                  \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);                \
+    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
+    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
     SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF);                    \
     SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF);                  \
     for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) {                   \
@@ -105,12 +101,12 @@ namespace libyuv {
       src_uv_p[i] =                                                            \
           (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH)));       \
     }                                                                          \
-    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                            \
-    memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
-    memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
-    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                        \
-    memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
-    memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
+    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                              \
+    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
+    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
+    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                          \
+    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
+    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     SRC_FMT_PLANAR##To##FMT_PLANAR(                                            \
         src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2,                          \
@@ -128,11 +124,11 @@ namespace libyuv {
           NEG kHeight);                                                        \
     }                                                                          \
     for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) {                     \
-      ASSERT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
     }                                                                          \
     for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) {       \
-      ASSERT_EQ(dst_u_c[i], dst_u_opt[i]);                                     \
-      ASSERT_EQ(dst_v_c[i], dst_v_opt[i]);                                     \
+      EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);                                     \
+      EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);                                     \
     }                                                                          \
     free_aligned_buffer_page_end(dst_y_c);                                     \
     free_aligned_buffer_page_end(dst_u_c);                                     \
@@ -227,11 +223,11 @@ TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
     const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                     \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
     align_buffer_page_end(src_u, kSizeUV + OFF);                              \
     align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);            \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);             \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
       src_y[i + OFF] = (fastrand() & 0xff);                                   \
     }                                                                         \
@@ -258,7 +254,7 @@ TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
            static_cast<int>((time1 - time0) * 1e6),                           \
            static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_));  \
     for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                      \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                  \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                  \
     }                                                                         \
     free_aligned_buffer_page_end(src_y);                                      \
     free_aligned_buffer_page_end(src_u);                                      \
@@ -385,58 +381,58 @@ TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
 TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
 #endif
 
-#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
-                   W1280, N, NEG, OFF)                                        \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                       \
-    const int kWidth = W1280;                                                 \
-    const int kHeight = benchmark_height_;                                    \
-    const int kStrideB = kWidth * BPP_B;                                      \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
-    align_buffer_page_end(                                                    \
-        src_uv, kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF);         \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                    \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                  \
-    for (int i = 0; i < kHeight; ++i)                                         \
-      for (int j = 0; j < kWidth; ++j)                                        \
-        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                    \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
-      for (int j = 0; j < kStrideUV * 2; ++j) {                               \
-        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);            \
-      }                                                                       \
-    }                                                                         \
-    memset(dst_argb_c, 1, kStrideB * kHeight);                                \
-    memset(dst_argb_opt, 101, kStrideB * kHeight);                            \
-    MaskCpuFlags(disable_cpu_flags_);                                         \
-    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,   \
-                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);   \
-    MaskCpuFlags(benchmark_cpu_info_);                                        \
-    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
-      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
-                            dst_argb_opt, kWidth * BPP_B, kWidth,             \
-                            NEG kHeight);                                     \
-    }                                                                         \
-    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */   \
-    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                \
-    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);              \
-    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                            \
-    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                        \
-    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,     \
-                  kHeight);                                                   \
-    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
-                  kHeight);                                                   \
-    for (int i = 0; i < kHeight; ++i) {                                       \
-      for (int j = 0; j < kWidth * 4; ++j) {                                  \
-        ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j],                           \
-                  dst_argb32_opt[i * kWidth * 4 + j]);                        \
-      }                                                                       \
-    }                                                                         \
-    free_aligned_buffer_page_end(src_y);                                      \
-    free_aligned_buffer_page_end(src_uv);                                     \
-    free_aligned_buffer_page_end(dst_argb_c);                                 \
-    free_aligned_buffer_page_end(dst_argb_opt);                               \
-    free_aligned_buffer_page_end(dst_argb32_c);                               \
-    free_aligned_buffer_page_end(dst_argb32_opt);                             \
+#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,      \
+                   W1280, N, NEG, OFF)                                         \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                        \
+    const int kWidth = W1280;                                                  \
+    const int kHeight = benchmark_height_;                                     \
+    const int kStrideB = kWidth * BPP_B;                                       \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_uv,                                              \
+                          kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight);                      \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight);                    \
+    for (int i = 0; i < kHeight; ++i)                                          \
+      for (int j = 0; j < kWidth; ++j)                                         \
+        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                     \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                  \
+      for (int j = 0; j < kStrideUV * 2; ++j) {                                \
+        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);             \
+      }                                                                        \
+    }                                                                          \
+    memset(dst_argb_c, 1, kStrideB* kHeight);                                  \
+    memset(dst_argb_opt, 101, kStrideB* kHeight);                              \
+    MaskCpuFlags(disable_cpu_flags_);                                          \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,    \
+                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);    \
+    MaskCpuFlags(benchmark_cpu_info_);                                         \
+    for (int i = 0; i < benchmark_iterations_; ++i) {                          \
+      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,  \
+                            dst_argb_opt, kWidth * BPP_B, kWidth,              \
+                            NEG kHeight);                                      \
+    }                                                                          \
+    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
+    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                 \
+    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);               \
+    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                             \
+    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                         \
+    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,      \
+                  kHeight);                                                    \
+    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth,  \
+                  kHeight);                                                    \
+    for (int i = 0; i < kHeight; ++i) {                                        \
+      for (int j = 0; j < kWidth * 4; ++j) {                                   \
+        EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j],                            \
+                  dst_argb32_opt[i * kWidth * 4 + j]);                         \
+      }                                                                        \
+    }                                                                          \
+    free_aligned_buffer_page_end(src_y);                                       \
+    free_aligned_buffer_page_end(src_uv);                                      \
+    free_aligned_buffer_page_end(dst_argb_c);                                  \
+    free_aligned_buffer_page_end(dst_argb_opt);                                \
+    free_aligned_buffer_page_end(dst_argb32_c);                                \
+    free_aligned_buffer_page_end(dst_argb32_opt);                              \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -511,16 +507,15 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
     const int kStrideB =                                                       \
         (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                 \
     align_buffer_page_end(src_argb,                                            \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_argb_c,                                          \
-                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
     align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
+                          kStrideB* kHeightB*(int)sizeof(TYPE_B));             \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
       src_argb[i + OFF] = (fastrand() & 0xff);                                 \
     }                                                                          \
-    memset(dst_argb_c, 1, kStrideB * kHeightB);                                \
-    memset(dst_argb_opt, 101, kStrideB * kHeightB);                            \
+    memset(dst_argb_c, 1, kStrideB* kHeightB);                                 \
+    memset(dst_argb_opt, 101, kStrideB* kHeightB);                             \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \
                      kStrideB, kWidth, NEG kHeight);                           \
@@ -530,49 +525,48 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
                        (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight);  \
     }                                                                          \
     for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) {      \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
     }                                                                          \
     free_aligned_buffer_page_end(src_argb);                                    \
     free_aligned_buffer_page_end(dst_argb_c);                                  \
     free_aligned_buffer_page_end(dst_argb_opt);                                \
   }
 
-#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,     \
-                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                   \
-  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                    \
-    for (int times = 0; times < benchmark_iterations_; ++times) {           \
-      const int kWidth = (fastrand() & 63) + 1;                             \
-      const int kHeight = (fastrand() & 31) + 1;                            \
-      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;  \
-      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;  \
-      const int kStrideA =                                                  \
-          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;            \
-      const int kStrideB =                                                  \
-          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;            \
-      align_buffer_page_end(src_argb,                                       \
-                            kStrideA * kHeightA * (int)sizeof(TYPE_A));     \
-      align_buffer_page_end(dst_argb_c,                                     \
-                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
-      align_buffer_page_end(dst_argb_opt,                                   \
-                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
-      for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
-        src_argb[i] = 0xfe;                                                 \
-      }                                                                     \
-      memset(dst_argb_c, 123, kStrideB * kHeightB);                         \
-      memset(dst_argb_opt, 123, kStrideB * kHeightB);                       \
-      MaskCpuFlags(disable_cpu_flags_);                                     \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,    \
-                       kStrideB, kWidth, kHeight);                          \
-      MaskCpuFlags(benchmark_cpu_info_);                                    \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,  \
-                       kStrideB, kWidth, kHeight);                          \
-      for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
-        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                          \
-      }                                                                     \
-      free_aligned_buffer_page_end(src_argb);                               \
-      free_aligned_buffer_page_end(dst_argb_c);                             \
-      free_aligned_buffer_page_end(dst_argb_opt);                           \
-    }                                                                       \
+#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,        \
+                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                      \
+  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                       \
+    for (int times = 0; times < benchmark_iterations_; ++times) {              \
+      const int kWidth = (fastrand() & 63) + 1;                                \
+      const int kHeight = (fastrand() & 31) + 1;                               \
+      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;     \
+      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;     \
+      const int kStrideA =                                                     \
+          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
+      const int kStrideB =                                                     \
+          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
+      align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+      align_buffer_page_end(dst_argb_c,                                        \
+                            kStrideB* kHeightB*(int)sizeof(TYPE_B));           \
+      align_buffer_page_end(dst_argb_opt,                                      \
+                            kStrideB* kHeightB*(int)sizeof(TYPE_B));           \
+      for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {    \
+        src_argb[i] = 0xfe;                                                    \
+      }                                                                        \
+      memset(dst_argb_c, 123, kStrideB* kHeightB);                             \
+      memset(dst_argb_opt, 123, kStrideB* kHeightB);                           \
+      MaskCpuFlags(disable_cpu_flags_);                                        \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,       \
+                       kStrideB, kWidth, kHeight);                             \
+      MaskCpuFlags(benchmark_cpu_info_);                                       \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,     \
+                       kStrideB, kWidth, kHeight);                             \
+      for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) {    \
+        EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
+      }                                                                        \
+      free_aligned_buffer_page_end(src_argb);                                  \
+      free_aligned_buffer_page_end(dst_argb_c);                                \
+      free_aligned_buffer_page_end(dst_argb_opt);                              \
+    }                                                                          \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -678,11 +672,11 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
     const int kStrideB =                                                      \
         (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                \
     align_buffer_page_end(src_argb,                                           \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
     align_buffer_page_end(dst_argb_c,                                         \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
     align_buffer_page_end(dst_argb_opt,                                       \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {     \
       src_argb[i + OFF] = (fastrand() & 0xff);                                \
     }                                                                         \
@@ -703,7 +697,7 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
     FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA,       \
                      (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight);   \
     for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) {     \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                              \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                              \
     }                                                                         \
     free_aligned_buffer_page_end(src_argb);                                   \
     free_aligned_buffer_page_end(dst_argb_c);                                 \
@@ -797,14 +791,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
         (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
     const int kStrideB =                                                     \
         (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-    align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF);              \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
+    align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
     for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
       src_argb[i + OFF] = (fastrand() & 0xff);                               \
     }                                                                        \
-    memset(dst_argb_c, 1, kStrideB * kHeightB);                              \
-    memset(dst_argb_opt, 101, kStrideB * kHeightB);                          \
+    memset(dst_argb_c, 1, kStrideB* kHeightB);                               \
+    memset(dst_argb_opt, 101, kStrideB* kHeightB);                           \
     MaskCpuFlags(disable_cpu_flags_);                                        \
     FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
                              NULL, kWidth, NEG kHeight);                     \
@@ -814,7 +808,7 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
                                kStrideB, NULL, kWidth, NEG kHeight);         \
     }                                                                        \
     for (int i = 0; i < kStrideB * kHeightB; ++i) {                          \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
     }                                                                        \
     free_aligned_buffer_page_end(src_argb);                                  \
     free_aligned_buffer_page_end(dst_argb_c);                                \
@@ -833,14 +827,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
           (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
       const int kStrideB =                                                     \
           (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-      align_buffer_page_end(src_argb, kStrideA * kHeightA);                    \
-      align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
-      align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
+      align_buffer_page_end(src_argb, kStrideA* kHeightA);                     \
+      align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
+      align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
       for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
         src_argb[i] = (fastrand() & 0xff);                                     \
       }                                                                        \
-      memset(dst_argb_c, 123, kStrideB * kHeightB);                            \
-      memset(dst_argb_opt, 123, kStrideB * kHeightB);                          \
+      memset(dst_argb_c, 123, kStrideB* kHeightB);                             \
+      memset(dst_argb_opt, 123, kStrideB* kHeightB);                           \
       MaskCpuFlags(disable_cpu_flags_);                                        \
       FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
                                kWidth, kHeight);                               \
@@ -848,7 +842,7 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
       FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB,     \
                                NULL, kWidth, kHeight);                         \
       for (int i = 0; i < kStrideB * kHeightB; ++i) {                          \
-        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
+        EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
       }                                                                        \
       free_aligned_buffer_page_end(src_argb);                                  \
       free_aligned_buffer_page_end(dst_argb_c);                                \
@@ -891,16 +885,15 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
     const int kStrideA =                                                       \
         (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;                 \
     align_buffer_page_end(src_argb,                                            \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_argb_c,                                          \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+    align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
     align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A));             \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
       src_argb[i + OFF] = (fastrand() & 0xff);                                 \
     }                                                                          \
-    memset(dst_argb_c, 1, kStrideA * kHeightA);                                \
-    memset(dst_argb_opt, 101, kStrideA * kHeightA);                            \
+    memset(dst_argb_c, 1, kStrideA* kHeightA);                                 \
+    memset(dst_argb_opt, 101, kStrideA* kHeightA);                             \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c,         \
              kStrideA, kWidth, NEG kHeight);                                   \
@@ -916,8 +909,8 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
     FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \
              kWidth, NEG kHeight);                                             \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
-      ASSERT_EQ(src_argb[i + OFF], dst_argb_opt[i]);                           \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
+      EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]);                           \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
     }                                                                          \
     free_aligned_buffer_page_end(src_argb);                                    \
     free_aligned_buffer_page_end(dst_argb_c);                                  \
@@ -952,12 +945,12 @@ TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
     const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
     align_buffer_page_end(src_u, kSizeUV + OFF);                               \
     align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
+    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       src_y[i + OFF] = (fastrand() & 0xff);                                    \
       src_a[i + OFF] = (fastrand() & 0xff);                                    \
@@ -981,7 +974,7 @@ TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
                             ATTEN);                                            \
     }                                                                          \
     for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
     }                                                                          \
     free_aligned_buffer_page_end(src_y);                                       \
     free_aligned_buffer_page_end(src_u);                                       \
@@ -1171,7 +1164,7 @@ TEST_F(LibYUVConvertTest, TestYToARGB) {
            argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
   }
   for (int i = 0; i < 32; ++i) {
-    ASSERT_EQ(expectedg[i], argb[i * 4 + 0]);
+    EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
   }
 }
 
@@ -1193,7 +1186,7 @@ TEST_F(LibYUVConvertTest, TestNoDither) {
                      benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
                      benchmark_height_);
   for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
-    ASSERT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
+    EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
   }
 
   free_aligned_buffer_page_end(src_argb);
@@ -1230,7 +1223,7 @@ TEST_F(LibYUVConvertTest, TestDither) {
                benchmark_width_ * 4, benchmark_width_, benchmark_height_);
 
   for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
-    ASSERT_NEAR(dst_argb[i], dst_argbdither[i], 9);
+    EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
   }
   free_aligned_buffer_page_end(src_argb);
   free_aligned_buffer_page_end(dst_rgb565);
@@ -1247,11 +1240,11 @@ TEST_F(LibYUVConvertTest, TestDither) {
     const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
     align_buffer_page_end(src_u, kSizeUV + OFF);                               \
     align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       src_y[i + OFF] = (fastrand() & 0xff);                                    \
     }                                                                          \
@@ -1272,16 +1265,16 @@ TEST_F(LibYUVConvertTest, TestDither) {
           dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight);            \
     }                                                                          \
     /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
-    align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight);             \
-    align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight);           \
-    memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight);                         \
-    memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight);                     \
+    align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight);               \
+    align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight);             \
+    memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight);                           \
+    memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight);                       \
     FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
                      kWidth, kHeight);                                         \
     FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt,             \
                      kWidth * BPP_C, kWidth, kHeight);                         \
     for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb32_c[i], dst_argb32_opt[i]);                           \
+      EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]);                           \
     }                                                                          \
     free_aligned_buffer_page_end(src_y);                                       \
     free_aligned_buffer_page_end(src_u);                                       \
@@ -1324,10 +1317,10 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
     const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                    \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
     align_buffer_page_end(src_u, kSizeUV + OFF);                              \
     align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);               \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
       src_y[i + OFF] = (fastrand() & 0xff);                                   \
     }                                                                         \
@@ -1341,8 +1334,8 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
                           kWidth, NEG kHeight);                               \
     /* Convert to a 3rd format in 1 step and 2 steps and compare  */          \
     const int kStrideC = kWidth * BPP_C;                                      \
-    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);             \
+    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);              \
     memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                          \
     memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                         \
     for (int i = 0; i < benchmark_iterations_; ++i) {                         \
@@ -1354,7 +1347,7 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
                        kStrideC, kWidth, kHeight);                            \
     }                                                                         \
     for (int i = 0; i < kStrideC * kHeight; ++i) {                            \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                   \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                   \
     }                                                                         \
     free_aligned_buffer_page_end(src_y);                                      \
     free_aligned_buffer_page_end(src_u);                                      \
@@ -1471,14 +1464,14 @@ TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
     const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
     const int kSizeUV =                                                        \
         SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y);          \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
     align_buffer_page_end(src_u, kSizeUV + OFF);                               \
     align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
-    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
+    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
     const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
     memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
     memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
     memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
@@ -1506,7 +1499,7 @@ TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
         src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth,        \
         dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN);               \
     for (int i = 0; i < kStrideC * kHeight; ++i) {                             \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                    \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                    \
     }                                                                          \
     free_aligned_buffer_page_end(src_y);                                       \
     free_aligned_buffer_page_end(src_u);                                       \
@@ -1585,16 +1578,16 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
     const int kHeight = benchmark_height_;                                     \
     const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                     \
     const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
-    align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
+    align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
     MemRandomize(src_argb_a + OFF, kStrideA * kHeight);                        \
     memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
     FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB,   \
                      kWidth, NEG kHeight);                                     \
     /* Convert to a 3rd format in 1 step and 2 steps and compare  */           \
     const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
     memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
     memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
     for (int i = 0; i < benchmark_iterations_; ++i) {                          \
@@ -1605,10 +1598,10 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
                        kStrideC, kWidth, kHeight);                             \
     }                                                                          \
     for (int i = 0; i < kStrideC * kHeight; i += 4) {                          \
-      ASSERT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]);            \
-      ASSERT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]);            \
-      ASSERT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]);            \
-      ASSERT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64);      \
+      EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]);            \
+      EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]);            \
+      EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]);            \
+      EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64);      \
     }                                                                          \
     free_aligned_buffer_page_end(src_argb_a);                                  \
     free_aligned_buffer_page_end(dst_argb_b);                                  \
@@ -1671,12 +1664,12 @@ TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
                         2,  // crop height
                         kRotate90, FOURCC_ARGB);
 
-  ASSERT_EQ(r, 0);
+  EXPECT_EQ(r, 0);
   // 90 degrees rotation, no conversion
-  ASSERT_EQ(dst[0], src[2]);
-  ASSERT_EQ(dst[1], src[0]);
-  ASSERT_EQ(dst[2], src[3]);
-  ASSERT_EQ(dst[3], src[1]);
+  EXPECT_EQ(dst[0], src[2]);
+  EXPECT_EQ(dst[1], src[0]);
+  EXPECT_EQ(dst[2], src[3]);
+  EXPECT_EQ(dst[3], src[1]);
 }
 
 #ifdef HAS_ARGBTOAR30ROW_AVX2
@@ -1704,7 +1697,7 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
     }
   }
   for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_opt[i], dst_c[i]);
+    EXPECT_EQ(dst_opt[i], dst_c[i]);
   }
 
   free_aligned_buffer_page_end(src);
@@ -1738,7 +1731,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
     }
   }
   for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_opt[i], dst_c[i]);
+    EXPECT_EQ(dst_opt[i], dst_c[i]);
   }
 
   free_aligned_buffer_page_end(src);
@@ -1805,11 +1798,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
     const int kBpc = 2;                                                       \
-    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);             \
-    align_buffer_page_end(src_u, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(src_v, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);             \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);           \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);               \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);            \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
       reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
     }                                                                         \
@@ -1834,7 +1827,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
           dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight);                \
     }                                                                         \
     for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                      \
-      ASSERT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                \
+      EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                \
     }                                                                         \
     free_aligned_buffer_page_end(src_y);                                      \
     free_aligned_buffer_page_end(src_u);                                      \
@@ -1920,12 +1913,12 @@ TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
     const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + OFF);               \
-    align_buffer_page_end(src_u, kSizeUV * kBpc + OFF);                        \
-    align_buffer_page_end(src_v, kSizeUV * kBpc + OFF);                        \
-    align_buffer_page_end(src_a, kWidth * kHeight * kBpc + OFF);               \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF);                 \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF);                         \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF);                         \
+    align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF);                 \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       reinterpret_cast<uint16_t*>(src_y + OFF)[i] =                            \
           (fastrand() & ((1 << S_DEPTH) - 1));                                 \
@@ -1957,7 +1950,7 @@ TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
           dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN);           \
     }                                                                          \
     for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
     }                                                                          \
     free_aligned_buffer_page_end(src_y);                                       \
     free_aligned_buffer_page_end(src_u);                                       \
@@ -2153,10 +2146,10 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;         \
     const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);              \
-    align_buffer_page_end(src_uv, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);              \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);            \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);                \
+    align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);               \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);             \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       reinterpret_cast<uint16_t*>(src_y + SOFF)[i] =                           \
           (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH)));                 \
@@ -2180,7 +2173,7 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
                             NEG kHeight);                                      \
     }                                                                          \
     for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                 \
+      EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                 \
     }                                                                          \
     free_aligned_buffer_page_end(src_y);                                       \
     free_aligned_buffer_page_end(src_uv);                                      \
@@ -2323,10 +2316,10 @@ TEST_F(LibYUVConvertTest, TestH420ToARGB) {
     ++histogram_r[r];
     // Reference formula for Y channel contribution in YUV to RGB conversions:
     int expected_y = Clamp(static_cast<int>((i - 16) * 1.164f + 0.5f));
-    ASSERT_EQ(b, expected_y);
-    ASSERT_EQ(g, expected_y);
-    ASSERT_EQ(r, expected_y);
-    ASSERT_EQ(a, 255);
+    EXPECT_EQ(b, expected_y);
+    EXPECT_EQ(g, expected_y);
+    EXPECT_EQ(r, expected_y);
+    EXPECT_EQ(a, 255);
   }
 
   int count_b = 0;
@@ -2384,10 +2377,10 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) {
     ++histogram_g[g];
     ++histogram_r[r];
     int expected_y = Clamp(static_cast<int>((i - 64) * 1.164f / 4));
-    ASSERT_NEAR(b, expected_y, 1);
-    ASSERT_NEAR(g, expected_y, 1);
-    ASSERT_NEAR(r, expected_y, 1);
-    ASSERT_EQ(a, 255);
+    EXPECT_NEAR(b, expected_y, 1);
+    EXPECT_NEAR(g, expected_y, 1);
+    EXPECT_NEAR(r, expected_y, 1);
+    EXPECT_EQ(a, 255);
   }
 
   int count_b = 0;
@@ -2448,10 +2441,10 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) {
     ++histogram_g[g10];
     ++histogram_r[r10];
     int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f + 0.5));
-    ASSERT_NEAR(b10, expected_y, 4);
-    ASSERT_NEAR(g10, expected_y, 4);
-    ASSERT_NEAR(r10, expected_y, 4);
-    ASSERT_EQ(a2, 3);
+    EXPECT_NEAR(b10, expected_y, 4);
+    EXPECT_NEAR(g10, expected_y, 4);
+    EXPECT_NEAR(r10, expected_y, 4);
+    EXPECT_EQ(a2, 3);
   }
 
   int count_b = 0;
@@ -2512,10 +2505,10 @@ TEST_F(LibYUVConvertTest, TestH010ToAB30) {
     ++histogram_g[g10];
     ++histogram_r[r10];
     int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
-    ASSERT_NEAR(b10, expected_y, 4);
-    ASSERT_NEAR(g10, expected_y, 4);
-    ASSERT_NEAR(r10, expected_y, 4);
-    ASSERT_EQ(a2, 3);
+    EXPECT_NEAR(b10, expected_y, 4);
+    EXPECT_NEAR(g10, expected_y, 4);
+    EXPECT_NEAR(r10, expected_y, 4);
+    EXPECT_EQ(a2, 3);
   }
 
   int count_b = 0;
@@ -2574,10 +2567,10 @@ TEST_F(LibYUVConvertTest, TestH420ToAR30) {
     ++histogram_g[g10];
     ++histogram_r[r10];
     int expected_y = Clamp10(static_cast<int>((i - 16) * 1.164f * 4.f));
-    ASSERT_NEAR(b10, expected_y, 4);
-    ASSERT_NEAR(g10, expected_y, 4);
-    ASSERT_NEAR(r10, expected_y, 4);
-    ASSERT_EQ(a2, 3);
+    EXPECT_NEAR(b10, expected_y, 4);
+    EXPECT_NEAR(g10, expected_y, 4);
+    EXPECT_NEAR(r10, expected_y, 4);
+    EXPECT_EQ(a2, 3);
   }
 
   int count_b = 0;
@@ -2624,34 +2617,34 @@ TEST_F(LibYUVConvertTest, TestI400) {
   I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
                    kSize, 1);
 
-  ASSERT_EQ(0, argb_pixels_i400[0]);
-  ASSERT_EQ(0, argb_pixels_j400[0]);
-  ASSERT_EQ(0, argb_pixels_jpeg_i400[0]);
-  ASSERT_EQ(0, argb_pixels_h709_i400[0]);
-  ASSERT_EQ(0, argb_pixels_2020_i400[0]);
-  ASSERT_EQ(0, argb_pixels_i400[16 * 4]);
-  ASSERT_EQ(16, argb_pixels_j400[16 * 4]);
-  ASSERT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
-  ASSERT_EQ(0, argb_pixels_h709_i400[16 * 4]);
-  ASSERT_EQ(0, argb_pixels_2020_i400[16 * 4]);
-  ASSERT_EQ(130, argb_pixels_i400[128 * 4]);
-  ASSERT_EQ(128, argb_pixels_j400[128 * 4]);
-  ASSERT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
-  ASSERT_EQ(130, argb_pixels_h709_i400[128 * 4]);
-  ASSERT_EQ(130, argb_pixels_2020_i400[128 * 4]);
-  ASSERT_EQ(255, argb_pixels_i400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_j400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_h709_i400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_2020_i400[255 * 4]);
+  EXPECT_EQ(0, argb_pixels_i400[0]);
+  EXPECT_EQ(0, argb_pixels_j400[0]);
+  EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
+  EXPECT_EQ(0, argb_pixels_h709_i400[0]);
+  EXPECT_EQ(0, argb_pixels_2020_i400[0]);
+  EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
+  EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
+  EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
+  EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
+  EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
+  EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
+  EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
+  EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
+  EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
+  EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
+  EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);
 
   for (int i = 0; i < kSize * 4; ++i) {
     if ((i & 3) == 3) {
-      ASSERT_EQ(255, argb_pixels_j400[i]);
+      EXPECT_EQ(255, argb_pixels_j400[i]);
     } else {
-      ASSERT_EQ(i / 4, argb_pixels_j400[i]);
+      EXPECT_EQ(i / 4, argb_pixels_j400[i]);
     }
-    ASSERT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
+    EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
   }
 
   free_aligned_buffer_page_end(orig_i400);
@@ -2678,7 +2671,7 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
   ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);
 
   for (int i = 0; i < kSize * 3; ++i) {
-    ASSERT_EQ(orig_rgb24[i], dest_rgb24[i]);
+    EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
   }
 
   free_aligned_buffer_page_end(orig_rgb24);
@@ -2697,7 +2690,7 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB565) {
   }
   ARGBToRGB565(&orig_pixels[0][0], 0, &dest_rgb565[0][0], 0, 256, 1);
   uint32_t checksum = HashDjb2(&dest_rgb565[0][0], sizeof(dest_rgb565), 5381);
-  ASSERT_EQ(610919429u, checksum);
+  EXPECT_EQ(610919429u, checksum);
 }
 
 TEST_F(LibYUVConvertTest, TestYUY2ToARGB) {
@@ -2712,9 +2705,9 @@ TEST_F(LibYUVConvertTest, TestYUY2ToARGB) {
   YUY2ToARGB(&orig_pixels[0][0], 0, &dest_argb[0][0], 0, 256, 1);
   uint32_t checksum = HashDjb2(&dest_argb[0][0], sizeof(dest_argb), 5381);
 #if defined(LIBYUV_UNLIMITED_DATA)
-  ASSERT_EQ(10343289u, checksum);
+  EXPECT_EQ(10343289u, checksum);
 #else
-  ASSERT_EQ(3486643515u, checksum);
+  EXPECT_EQ(3486643515u, checksum);
 #endif
 }
 
@@ -2730,9 +2723,9 @@ TEST_F(LibYUVConvertTest, TestUYVYToARGB) {
   UYVYToARGB(&orig_pixels[0][0], 0, &dest_argb[0][0], 0, 256, 1);
   uint32_t checksum = HashDjb2(&dest_argb[0][0], sizeof(dest_argb), 5381);
 #if defined(LIBYUV_UNLIMITED_DATA)
-  ASSERT_EQ(10343289u, checksum);
+  EXPECT_EQ(10343289u, checksum);
 #else
-  ASSERT_EQ(3486643515u, checksum);
+  EXPECT_EQ(3486643515u, checksum);
 #endif
 }
 
@@ -2810,9 +2803,9 @@ TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
   printf("\n");
 
   uint32_t checksum_u = HashDjb2(&dest_u[0], sizeof(dest_u), 5381);
-  ASSERT_EQ(192508756u, checksum_u);
+  EXPECT_EQ(192508756u, checksum_u);
   uint32_t checksum_v = HashDjb2(&dest_v[0], sizeof(dest_v), 5381);
-  ASSERT_EQ(2590663990u, checksum_v);
+  EXPECT_EQ(2590663990u, checksum_v);
 }
 #endif
 
@@ -2838,23 +2831,16 @@ TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
         memset(dest_v_c, 0, sizeof(dest_v_c));
         memset(dest_u_opt, 0, sizeof(dest_u_opt));
         memset(dest_v_opt, 0, sizeof(dest_v_opt));
-
+        
         int src_stride = (height == 1) ? 0 : kMaxWidth * 4;
 
-        ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0],
-                            &dest_v_c[0], width, &kArgbI601Constants);
-        ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride,
-                                   &dest_u_opt[0], &dest_v_opt[0], width,
-                                   &kArgbI601Constants);
+        ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0], &dest_v_c[0], width, &kArgbI601Constants);
+        ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride, &dest_u_opt[0], &dest_v_opt[0], width, &kArgbI601Constants);
 
         int half_width = (width + 1) / 2;
         for (int i = 0; i < half_width; ++i) {
-          ASSERT_EQ(dest_u_c[i], dest_u_opt[i])
-              << "u mismatch at " << i << " width " << width << " height "
-              << height;
-          ASSERT_EQ(dest_v_c[i], dest_v_opt[i])
-              << "v mismatch at " << i << " width " << width << " height "
-              << height;
+          EXPECT_EQ(dest_u_c[i], dest_u_opt[i]) << "u mismatch at " << i << " width " << width << " height " << height;
+          EXPECT_EQ(dest_v_c[i], dest_v_opt[i]) << "v mismatch at " << i << " width " << width << " height " << height;
         }
       }
     }
@@ -2867,7 +2853,6 @@ TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
     (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))
 // TODO(fbarchard): Consider _set_new_mode(0) to make malloc return NULL
 
-#ifndef DISABLE_SLOW_TESTS
 TEST_F(LibYUVConvertTest, TestI400LargeSize) {
   // The width and height are chosen as follows:
   // - kWidth * kHeight is not a multiple of 8: This lets us to use the Any
@@ -2911,18 +2896,18 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
   for (int i = 0; i < kWidth * kHeight; ++i) {
     orig_i400[i] = i % 256;
   }
-  ASSERT_EQ(I400ToARGBMatrix(orig_i400, kStride, dest_argb, kWidth,
+  EXPECT_EQ(I400ToARGBMatrix(orig_i400, kStride, dest_argb, kWidth,
                              &kYuvJPEGConstants, kWidth, kHeight),
             0);
   free_aligned_buffer_page_end(dest_argb);
   free_aligned_buffer_page_end(orig_i400);
 }
-#endif  // DISABLE_SLOW_TESTS
 #endif  // !defined(DISABLE_SLOW_TESTS) && \
         // (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))
 
 #endif  // !defined(LEAN_TESTS)
 
+
 #define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
                    SUBSAMP_Y, W1280, N, NEG, OFF)                              \
   TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) {                             \
@@ -2935,17 +2920,17 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
     align_buffer_page_end(src_argb,                                            \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_y_c, kStrideY * kHeight);                        \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+    align_buffer_page_end(dst_y_c, kStrideY* kHeight);                         \
     align_buffer_page_end(dst_uv_c, kSizeUV);                                  \
-    align_buffer_page_end(dst_y_opt, kStrideY * kHeight);                      \
+    align_buffer_page_end(dst_y_opt, kStrideY* kHeight);                       \
     align_buffer_page_end(dst_uv_opt, kSizeUV);                                \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
       src_argb[i + OFF] = (fastrand() & 0xff);                                 \
     }                                                                          \
-    memset(dst_y_c, 1, kStrideY * kHeight);                                    \
+    memset(dst_y_c, 1, kStrideY* kHeight);                                     \
     memset(dst_uv_c, 2, kSizeUV);                                              \
-    memset(dst_y_opt, 101, kStrideY * kHeight);                                \
+    memset(dst_y_opt, 101, kStrideY* kHeight);                                 \
     memset(dst_uv_opt, 102, kSizeUV);                                          \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY,   \
@@ -2956,10 +2941,10 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
                        kStrideY, dst_uv_opt, kStrideUV, kWidth, NEG kHeight);  \
     }                                                                          \
     for (int i = 0; i < kStrideY * kHeight; ++i) {                             \
-      ASSERT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
     }                                                                          \
     for (int i = 0; i < kSizeUV; ++i) {                                        \
-      ASSERT_EQ(dst_uv_c[i], dst_uv_opt[i]);                                   \
+      EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]);                                   \
     }                                                                          \
     free_aligned_buffer_page_end(src_argb);                                    \
     free_aligned_buffer_page_end(dst_y_c);                                     \
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index a38e7fdf9..3d5ce3799 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -51,9 +51,9 @@ namespace libyuv {
 #define ABGRToABGR ARGBCopy
 
 // subsample amount uses a divide.
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
 
-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
 
 // Planar test
 
@@ -78,19 +78,17 @@ namespace libyuv {
     const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y);             \
     const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X);               \
     const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF);           \
+    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF);             \
     align_buffer_page_end(src_u,                                              \
-                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
+                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
     align_buffer_page_end(src_v,                                              \
-                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
-    align_buffer_page_end(dst_u_opt,                                          \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
-    align_buffer_page_end(dst_v_opt,                                          \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
+                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);   \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);   \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
+    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
     MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC);                    \
     MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
     MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
@@ -104,12 +102,12 @@ namespace libyuv {
       src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1);                       \
       src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1);                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
-    memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC);             \
-    memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC);             \
-    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
-    memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC);         \
-    memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC);         \
+    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
+    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC);               \
+    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC);               \
+    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
+    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC);           \
+    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC);           \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     SRC_FMT_PLANAR##To##FMT_PLANAR(                                           \
         src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth,      \
@@ -127,11 +125,11 @@ namespace libyuv {
           NEG kHeight);                                                       \
     }                                                                         \
     for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) {                    \
-      ASSERT_EQ(dst_y_c[i], dst_y_opt[i]);                                    \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]);                                    \
     }                                                                         \
     for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) {      \
-      ASSERT_EQ(dst_u_c[i], dst_u_opt[i]);                                    \
-      ASSERT_EQ(dst_v_c[i], dst_v_opt[i]);                                    \
+      EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);                                    \
+      EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);                                    \
     }                                                                         \
     free_aligned_buffer_page_end(dst_y_c);                                    \
     free_aligned_buffer_page_end(dst_u_c);                                    \
@@ -214,15 +212,15 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
     const int kHeight = benchmark_height_;                                    \
     const int kSizeUV =                                                       \
         SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
     align_buffer_page_end(src_uv,                                             \
-                          kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);     \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
+                          kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);       \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
     align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
     align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
     align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
                                          SUBSAMPLE(kHeight, SUBSAMP_Y));      \
     align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
@@ -241,12 +239,12 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
             (fastrand() & 0xff);                                              \
       }                                                                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth * kHeight);                                     \
+    memset(dst_y_c, 1, kWidth* kHeight);                                      \
     memset(dst_u_c, 2,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_c, 3,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
+    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
     memset(dst_u_opt, 102,                                                    \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_opt, 103,                                                    \
@@ -267,18 +265,18 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
     }                                                                         \
     for (int i = 0; i < kHeight; ++i) {                                       \
       for (int j = 0; j < kWidth; ++j) {                                      \
-        ASSERT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
+        EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
       }                                                                       \
     }                                                                         \
     for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
       for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                \
-        ASSERT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
+        EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
                   dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]);           \
       }                                                                       \
     }                                                                         \
     for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
       for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                \
-        ASSERT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
+        EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
                   dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]);           \
       }                                                                       \
     }                                                                         \
@@ -361,17 +359,17 @@ static int I400ToNV21(const uint8_t* src_y,
     const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y);             \
     const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X);               \
     const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF);           \
+    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF);             \
     align_buffer_page_end(src_u,                                              \
-                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
+                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
     align_buffer_page_end(src_v,                                              \
-                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
+                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
     align_buffer_page_end(dst_uv_c,                                           \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);      \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
+                          kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);        \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
     align_buffer_page_end(dst_uv_opt,                                         \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);      \
+                          kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);        \
     MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC);                    \
     MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
     MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
@@ -385,10 +383,10 @@ static int I400ToNV21(const uint8_t* src_y,
       src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1);                       \
       src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1);                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
-    memset(dst_uv_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);        \
-    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
-    memset(dst_uv_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);    \
+    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
+    memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);          \
+    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
+    memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);      \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth,   \
                                    src_v_p, kSrcHalfWidth,                    \
@@ -404,10 +402,10 @@ static int I400ToNV21(const uint8_t* src_y,
           NEG kHeight);                                                       \
     }                                                                         \
     for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) {                    \
-      ASSERT_EQ(dst_y_c[i], dst_y_opt[i]);                                    \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]);                                    \
     }                                                                         \
     for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) {  \
-      ASSERT_EQ(dst_uv_c[i], dst_uv_opt[i]);                                  \
+      EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]);                                  \
     }                                                                         \
     free_aligned_buffer_page_end(dst_y_c);                                    \
     free_aligned_buffer_page_end(dst_uv_c);                                   \
@@ -480,15 +478,14 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
         (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1);                   \
     const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X);   \
     const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y,                                              \
-                          kPaddedWidth * kPaddedHeight * SRC_BPC + OFF);      \
+    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
     align_buffer_page_end(                                                    \
         src_uv,                                                               \
         2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF);      \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
     align_buffer_page_end(dst_uv_c,                                           \
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);      \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
     align_buffer_page_end(dst_uv_opt,                                         \
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);      \
     SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF);                   \
@@ -505,13 +502,13 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
       src_uv_p[i] =                                                           \
           (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH)));      \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
+    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
     memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);        \
-    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
+    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
     memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);    \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     SRC_FMT_PLANAR##To##FMT_PLANAR(                                           \
-        src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p,             \
+        src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p,              \
         2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T),                     \
         DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth,               \
         reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, kWidth,        \
@@ -519,7 +516,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
     MaskCpuFlags(benchmark_cpu_info_);                                        \
     for (int i = 0; i < benchmark_iterations_; ++i) {                         \
       SRC_FMT_PLANAR##To##FMT_PLANAR(                                         \
-          src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p,           \
+          src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p,            \
           2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T),                   \
           DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth,           \
           reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, kWidth,    \
@@ -528,13 +525,13 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
     if (DOY) {                                                                \
       for (int i = 0; i < kHeight; ++i) {                                     \
         for (int j = 0; j < kWidth; ++j) {                                    \
-          ASSERT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);      \
+          EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);      \
         }                                                                     \
       }                                                                       \
     }                                                                         \
     for (int i = 0; i < kDstHalfHeight; ++i) {                                \
       for (int j = 0; j < 2 * kDstHalfWidth; ++j) {                           \
-        ASSERT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j],                        \
+        EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j],                        \
                   dst_uv_opt[i * 2 * kDstHalfWidth + j]);                     \
       }                                                                       \
     }                                                                         \
@@ -601,16 +598,16 @@ TESTBPTOBP(P010, uint16_t, 2, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
     const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                   \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8;           \
-    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                  \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight);                          \
+    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                   \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight);                           \
     align_buffer_page_end(dst_uv_c,                                            \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                         \
     align_buffer_page_end(dst_uv_opt,                                          \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    memset(dst_y_c, 1, kWidth * kHeight);                                      \
+    memset(dst_y_c, 1, kWidth* kHeight);                                       \
     memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    memset(dst_y_opt, 101, kWidth * kHeight);                                  \
+    memset(dst_y_opt, 101, kWidth* kHeight);                                   \
     memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));    \
     for (int i = 0; i < kHeight; ++i)                                          \
       for (int j = 0; j < kStride; ++j)                                        \
@@ -627,12 +624,12 @@ TESTBPTOBP(P010, uint16_t, 2, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
     }                                                                          \
     for (int i = 0; i < kHeight; ++i) {                                        \
       for (int j = 0; j < kWidth; ++j) {                                       \
-        ASSERT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);         \
+        EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);         \
       }                                                                        \
     }                                                                          \
     for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) {              \
       for (int j = 0; j < kStrideUV; ++j) {                                    \
-        ASSERT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
+        EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
       }                                                                        \
     }                                                                          \
     free_aligned_buffer_page_end(dst_y_c);                                     \
@@ -694,20 +691,20 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
     const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                   \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8;           \
-    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                  \
-    align_buffer_page_end(dst_a_c, kWidth * kHeight);                          \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight);                          \
+    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                   \
+    align_buffer_page_end(dst_a_c, kWidth* kHeight);                           \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight);                           \
     align_buffer_page_end(dst_uv_c,                                            \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    align_buffer_page_end(dst_a_opt, kWidth * kHeight);                        \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                        \
+    align_buffer_page_end(dst_a_opt, kWidth* kHeight);                         \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                         \
     align_buffer_page_end(dst_uv_opt,                                          \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    memset(dst_a_c, 1, kWidth * kHeight);                                      \
-    memset(dst_y_c, 2, kWidth * kHeight);                                      \
+    memset(dst_a_c, 1, kWidth* kHeight);                                       \
+    memset(dst_y_c, 2, kWidth* kHeight);                                       \
     memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    memset(dst_a_opt, 101, kWidth * kHeight);                                  \
-    memset(dst_y_opt, 102, kWidth * kHeight);                                  \
+    memset(dst_a_opt, 101, kWidth* kHeight);                                   \
+    memset(dst_y_opt, 102, kWidth* kHeight);                                   \
     memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));    \
     for (int i = 0; i < kHeight; ++i)                                          \
       for (int j = 0; j < kStride; ++j)                                        \
@@ -725,13 +722,13 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
     }                                                                          \
     for (int i = 0; i < kHeight; ++i) {                                        \
       for (int j = 0; j < kWidth; ++j) {                                       \
-        ASSERT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);         \
-        ASSERT_EQ(dst_a_c[i * kWidth + j], dst_a_opt[i * kWidth + j]);         \
+        EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);         \
+        EXPECT_EQ(dst_a_c[i * kWidth + j], dst_a_opt[i * kWidth + j]);         \
       }                                                                        \
     }                                                                          \
     for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) {              \
       for (int j = 0; j < kStrideUV; ++j) {                                    \
-        ASSERT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
+        EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \
       }                                                                        \
     }                                                                          \
     free_aligned_buffer_page_end(dst_a_c);                                     \
@@ -768,19 +765,19 @@ TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2)
     const int kHeight = benchmark_height_;                                    \
     const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                     \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
-    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                 \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
+    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                  \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
     align_buffer_page_end(dst_uv_c,                                           \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
     align_buffer_page_end(dst_uv_opt,                                         \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     for (int i = 0; i < kHeight; ++i)                                         \
       for (int j = 0; j < kStride; ++j)                                       \
         src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff);              \
-    memset(dst_y_c, 1, kWidth * kHeight);                                     \
+    memset(dst_y_c, 1, kWidth* kHeight);                                      \
     memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));       \
-    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
+    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
     memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));   \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
@@ -792,12 +789,12 @@ TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2)
     }                                                                         \
     for (int i = 0; i < kHeight; ++i) {                                       \
       for (int j = 0; j < kWidth; ++j) {                                      \
-        ASSERT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
+        EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
       }                                                                       \
     }                                                                         \
     for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
       for (int j = 0; j < kStrideUV * 2; ++j) {                               \
-        ASSERT_EQ(dst_uv_c[i * kStrideUV * 2 + j],                            \
+        EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j],                            \
                   dst_uv_opt[i * kStrideUV * 2 + j]);                         \
       }                                                                       \
     }                                                                         \
@@ -847,11 +844,11 @@ TEST_F(LibYUVConvertTest, ValidateJpeg) {
 
   // No SOI or EOI. Expect fail.
   memset(orig_pixels, 0, kSize);
-  ASSERT_FALSE(ValidateJpeg(orig_pixels, kSize));
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
 
   // Test special value that matches marker start.
   memset(orig_pixels, 0xff, kSize);
-  ASSERT_FALSE(ValidateJpeg(orig_pixels, kSize));
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
 
   // EOI, SOI. Expect pass.
   orig_pixels[0] = 0xff;
@@ -860,7 +857,7 @@ TEST_F(LibYUVConvertTest, ValidateJpeg) {
   orig_pixels[kSize - kOff + 0] = 0xff;
   orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
   for (int times = 0; times < benchmark_iterations_; ++times) {
-    ASSERT_TRUE(ValidateJpeg(orig_pixels, kSize));
+    EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
   }
   free_aligned_buffer_page_end(orig_pixels);
 }
@@ -878,7 +875,7 @@ TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
 
   // No SOI or EOI. Expect fail.
   memset(orig_pixels, 0, kBufSize);
-  ASSERT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize));
 
   // EOI, SOI. Expect pass.
   orig_pixels[0] = 0xff;
@@ -887,7 +884,7 @@ TEST_F(LibYUVConvertTest, ValidateJpegLarge) {
   orig_pixels[kSize - kOff + 0] = 0xff;
   orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
   for (int times = 0; times < benchmark_iterations_; ++times) {
-    ASSERT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
+    EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize));
   }
   free_aligned_buffer_page_end(orig_pixels);
 }
@@ -902,24 +899,24 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) {
   align_buffer_page_end(orig_pixels, kSize);
 
   // NULL pointer. Expect fail.
-  ASSERT_FALSE(ValidateJpeg(NULL, kSize));
+  EXPECT_FALSE(ValidateJpeg(NULL, kSize));
 
   // Negative size. Expect fail.
-  ASSERT_FALSE(ValidateJpeg(orig_pixels, -1));
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, -1));
 
   // Too large size. Expect fail.
-  ASSERT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull));
 
   // No SOI or EOI. Expect fail.
   memset(orig_pixels, 0, kSize);
-  ASSERT_FALSE(ValidateJpeg(orig_pixels, kSize));
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
 
   // SOI but no EOI. Expect fail.
   orig_pixels[0] = 0xff;
   orig_pixels[1] = 0xd8;  // SOI.
   orig_pixels[2] = 0xff;
   for (int times = 0; times < benchmark_iterations_; ++times) {
-    ASSERT_FALSE(ValidateJpeg(orig_pixels, kSize));
+    EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
   }
 
   // EOI but no SOI. Expect fail.
@@ -927,7 +924,7 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) {
   orig_pixels[1] = 0;
   orig_pixels[kSize - kOff + 0] = 0xff;
   orig_pixels[kSize - kOff + 1] = 0xd9;  // EOI.
-  ASSERT_FALSE(ValidateJpeg(orig_pixels, kSize));
+  EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
 
   free_aligned_buffer_page_end(orig_pixels);
 }
@@ -1251,7 +1248,7 @@ TEST_F(LibYUVConvertTest, TestMJPGSize) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   printf("test jpeg size %d x %d\n", width, height);
 }
@@ -1260,7 +1257,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1278,15 +1275,15 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) {
                      dst_v, half_width, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
   uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
   uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
-  ASSERT_EQ(dst_y_hash, 2682851208u);
-  ASSERT_EQ(dst_u_hash, 2501859930u);
-  ASSERT_EQ(dst_v_hash, 2126459123u);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_u_hash, 2501859930u);
+  EXPECT_EQ(dst_v_hash, 2126459123u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_u);
@@ -1297,7 +1294,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1316,7 +1313,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Convert to I420
   align_buffer_page_end(dst2_y, width * height);
@@ -1327,7 +1324,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
                      dst2_v, half_width, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Convert I420 to NV21
   align_buffer_page_end(dst3_y, width * height);
@@ -1337,11 +1334,11 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
              width, dst3_vu, half_width * 2, width, height);
 
   for (int i = 0; i < width * height; ++i) {
-    ASSERT_EQ(dst_y[i], dst3_y[i]);
+    EXPECT_EQ(dst_y[i], dst3_y[i]);
   }
   for (int i = 0; i < half_width * half_height * 2; ++i) {
-    ASSERT_EQ(dst_vu[i], dst3_vu[i]);
-    ASSERT_EQ(dst_vu[i], dst3_vu[i]);
+    EXPECT_EQ(dst_vu[i], dst3_vu[i]);
+    EXPECT_EQ(dst_vu[i], dst3_vu[i]);
   }
 
   free_aligned_buffer_page_end(dst3_y);
@@ -1359,7 +1356,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1378,7 +1375,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Convert to I420
   align_buffer_page_end(dst2_y, width * height);
@@ -1389,7 +1386,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
                      dst2_v, half_width, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Convert I420 to NV12
   align_buffer_page_end(dst3_y, width * height);
@@ -1399,11 +1396,11 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
              width, dst3_uv, half_width * 2, width, height);
 
   for (int i = 0; i < width * height; ++i) {
-    ASSERT_EQ(dst_y[i], dst3_y[i]);
+    EXPECT_EQ(dst_y[i], dst3_y[i]);
   }
   for (int i = 0; i < half_width * half_height * 2; ++i) {
-    ASSERT_EQ(dst_uv[i], dst3_uv[i]);
-    ASSERT_EQ(dst_uv[i], dst3_uv[i]);
+    EXPECT_EQ(dst_uv[i], dst3_uv[i]);
+    EXPECT_EQ(dst_uv[i], dst3_uv[i]);
   }
 
   free_aligned_buffer_page_end(dst3_y);
@@ -1421,7 +1418,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1438,13 +1435,13 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
   uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 2682851208u);
-  ASSERT_EQ(dst_uv_hash, 1069662856u);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_uv_hash, 1069662856u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1454,7 +1451,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1471,7 +1468,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value. Hashes are for VU so flip the plane.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
@@ -1479,8 +1476,8 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
   SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
               half_height);
   uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 2682851208u);
-  ASSERT_EQ(dst_vu_hash, 1069662856u);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_vu_hash, 1069662856u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1492,7 +1489,7 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1509,13 +1506,13 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
   uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 2682851208u);
-  ASSERT_EQ(dst_uv_hash, 493520167u);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_uv_hash, 493520167u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1525,7 +1522,7 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1542,7 +1539,7 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value. Hashes are for VU so flip the plane.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
@@ -1550,8 +1547,8 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) {
   SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
               half_height);
   uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 2682851208u);
-  ASSERT_EQ(dst_vu_hash, 493520167u);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_vu_hash, 493520167u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1562,7 +1559,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1579,13 +1576,13 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
   uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 330644005u);
-  ASSERT_EQ(dst_uv_hash, 135214341u);
+  EXPECT_EQ(dst_y_hash, 330644005u);
+  EXPECT_EQ(dst_uv_hash, 135214341u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1595,7 +1592,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1612,7 +1609,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value. Hashes are for VU so flip the plane.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
@@ -1620,8 +1617,8 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) {
   SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
               half_height);
   uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 330644005u);
-  ASSERT_EQ(dst_vu_hash, 135214341u);
+  EXPECT_EQ(dst_y_hash, 330644005u);
+  EXPECT_EQ(dst_vu_hash, 135214341u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1632,7 +1629,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1649,13 +1646,13 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
   uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 2682851208u);
-  ASSERT_EQ(dst_uv_hash, 506143297u);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_uv_hash, 506143297u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1665,7 +1662,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int half_width = (width + 1) / 2;
   int half_height = (height + 1) / 2;
@@ -1682,7 +1679,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
                      half_width * 2, width, height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value. Hashes are for VU so flip the plane.
   uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
@@ -1690,8 +1687,8 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) {
   SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
               half_height);
   uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
-  ASSERT_EQ(dst_y_hash, 2682851208u);
-  ASSERT_EQ(dst_vu_hash, 506143297u);
+  EXPECT_EQ(dst_y_hash, 2682851208u);
+  EXPECT_EQ(dst_vu_hash, 506143297u);
 
   free_aligned_buffer_page_end(dst_y);
   free_aligned_buffer_page_end(dst_uv);
@@ -1702,7 +1699,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
   int width = 0;
   int height = 0;
   int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                              benchmark_height_ / (width * height);
@@ -1716,14 +1713,14 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
                      height, width, height);
   }
   // Expect sucesss
-  ASSERT_EQ(0, ret);
+  EXPECT_EQ(0, ret);
 
   // Test result matches known hash value.
   uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
 #ifdef LIBYUV_UNLIMITED_DATA
-  ASSERT_EQ(dst_argb_hash, 3900633302u);
+  EXPECT_EQ(dst_argb_hash, 3900633302u);
 #else
-  ASSERT_EQ(dst_argb_hash, 2355976473u);
+  EXPECT_EQ(dst_argb_hash, 2355976473u);
 #endif
 
   free_aligned_buffer_page_end(dst_argb);
@@ -1786,11 +1783,11 @@ static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
 }
 
 TEST_F(LibYUVConvertTest, TestMJPGInfo) {
-  ASSERT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
-  ASSERT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
-  ASSERT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
-  ASSERT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
-  ASSERT_EQ(1, ShowJPegInfo(kTest4Jpg,
+  EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
+  EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
+  EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
+  EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
+  EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
                             kTest4JpgLen));  // Valid but unsupported.
 }
 #endif  // HAVE_JPEG
@@ -1851,18 +1848,18 @@ TEST_F(LibYUVConvertTest, NV12Crop) {
 
   for (int i = 0; i < kDestHeight; ++i) {
     for (int j = 0; j < kDestWidth; ++j) {
-      ASSERT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]);
+      EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]);
     }
   }
   for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
     for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
-      ASSERT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+      EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
                 dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
     }
   }
   for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
     for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
-      ASSERT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
+      EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j],
                 dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
     }
   }
@@ -1924,19 +1921,19 @@ TEST_F(LibYUVConvertTest, I420CropOddY) {
 
   for (int i = 0; i < kDestHeight; ++i) {
     for (int j = 0; j < kDestWidth; ++j) {
-      ASSERT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
+      EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
                 dst_y[i * kDestWidth + j]);
     }
   }
   for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
     for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
-      ASSERT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
+      EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
                 dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
     }
   }
   for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
     for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
-      ASSERT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
+      EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
                 dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
     }
   }
@@ -1953,17 +1950,17 @@ TEST_F(LibYUVConvertTest, I420CropOddY) {
     const int kHeight = benchmark_height_;                                    \
                                                                               \
     align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight);     \
-    align_buffer_page_end(orig_y, kWidth * kHeight);                          \
+    align_buffer_page_end(orig_y, kWidth* kHeight);                           \
     align_buffer_page_end(orig_u,                                             \
                           SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));      \
     align_buffer_page_end(orig_v,                                             \
                           SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));      \
                                                                               \
-    align_buffer_page_end(dst_y_orig, kWidth * kHeight);                      \
+    align_buffer_page_end(dst_y_orig, kWidth* kHeight);                       \
     align_buffer_page_end(dst_uv_orig,                                        \
                           2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));  \
                                                                               \
-    align_buffer_page_end(dst_y, kWidth * kHeight);                           \
+    align_buffer_page_end(dst_y, kWidth* kHeight);                            \
     align_buffer_page_end(dst_uv,                                             \
                           2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));  \
                                                                               \
@@ -1984,14 +1981,14 @@ TEST_F(LibYUVConvertTest, I420CropOddY) {
     }                                                                         \
                                                                               \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
-      ASSERT_EQ(orig_y[i], dst_y[i]);                                         \
+      EXPECT_EQ(orig_y[i], dst_y[i]);                                         \
     }                                                                         \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
-      ASSERT_EQ(dst_y_orig[i], dst_y[i]);                                     \
+      EXPECT_EQ(dst_y_orig[i], dst_y[i]);                                     \
     }                                                                         \
     for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2);     \
          ++i) {                                                               \
-      ASSERT_EQ(dst_uv_orig[i], dst_uv[i]);                                   \
+      EXPECT_EQ(dst_uv_orig[i], dst_uv[i]);                                   \
     }                                                                         \
                                                                               \
     free_aligned_buffer_page_end(orig_uyvy);                                  \
@@ -2040,7 +2037,7 @@ TEST_F(LibYUVConvertTest, MM21ToYUY2) {
   }
 
   for (int i = 0; i < 4 * SUBSAMPLE(kWidth, 2) * kHeight; ++i) {
-    ASSERT_EQ(dst_yuyv[i], golden_yuyv[i]);
+    EXPECT_EQ(dst_yuyv[i], golden_yuyv[i]);
   }
 
   free_aligned_buffer_page_end(orig_y);
@@ -2053,6 +2050,7 @@ TEST_F(LibYUVConvertTest, MM21ToYUY2) {
 }
 
 // Test RGB24 to J420 is exact
+#if defined(LIBYUV_BIT_EXACT)
 TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
   const int kSize = 256;
   align_buffer_page_end(orig_rgb24, kSize * 3 * 2);  // 2 rows of RGB24
@@ -2072,13 +2070,15 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
   }
 
   uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381);
-  ASSERT_EQ(223551344u, checksum);
+  EXPECT_EQ(223551344u, checksum);
 
   free_aligned_buffer_page_end(orig_rgb24);
   free_aligned_buffer_page_end(dest_j420);
 }
+#endif
 
 // Test RGB24 to I420 is exact
+#if defined(LIBYUV_BIT_EXACT)
 TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
   const int kSize = 256;
   align_buffer_page_end(orig_rgb24, kSize * 3 * 2);  // 2 rows of RGB24
@@ -2098,11 +2098,12 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
   }
 
   uint32_t checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381);
-  ASSERT_EQ(4197774805u, checksum);
+  EXPECT_EQ(4197774805u, checksum);
 
   free_aligned_buffer_page_end(orig_rgb24);
   free_aligned_buffer_page_end(dest_i420);
 }
+#endif
 
 TEST_F(LibYUVConvertTest, TestJ420ToI420) {
   const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255,
@@ -2115,15 +2116,15 @@ TEST_F(LibYUVConvertTest, TestJ420ToI420) {
   ASSERT_EQ(J420ToI420(src_y, 6, src_u, 3, src_v, 3, dst_y, 6, dst_u, 3, dst_v,
                        3, 6, 2),
             0);
-  ASSERT_EQ(dst_y[0], 16);
-  ASSERT_EQ(dst_y[2], 126);
-  ASSERT_EQ(dst_y[4], 235);
-  ASSERT_EQ(dst_u[0], 16);
-  ASSERT_EQ(dst_u[1], 128);
-  ASSERT_EQ(dst_u[2], 240);
-  ASSERT_EQ(dst_v[0], 16);
-  ASSERT_EQ(dst_v[1], 128);
-  ASSERT_EQ(dst_v[2], 240);
+  EXPECT_EQ(dst_y[0], 16);
+  EXPECT_EQ(dst_y[2], 126);
+  EXPECT_EQ(dst_y[4], 235);
+  EXPECT_EQ(dst_u[0], 16);
+  EXPECT_EQ(dst_u[1], 128);
+  EXPECT_EQ(dst_u[2], 240);
+  EXPECT_EQ(dst_v[0], 16);
+  EXPECT_EQ(dst_v[1], 128);
+  EXPECT_EQ(dst_v[2], 240);
 }
 
 TEST_F(LibYUVConvertTest, TestABGRToI420Matrix) {
@@ -2176,6 +2177,42 @@ TEST_F(LibYUVConvertTest, TestABGRToI420Matrix) {
   free_aligned_buffer_page_end(ref_v);
 }
 
+TEST_F(LibYUVConvertTest, TestABGRToI422Matrix) {
+  const int kWidth = 16;
+  const int kHeight = 16;
+  align_buffer_page_end(src_abgr, kWidth * kHeight * 4);
+  align_buffer_page_end(dst_y, kWidth * kHeight);
+  align_buffer_page_end(dst_u, kWidth / 2 * kHeight);
+  align_buffer_page_end(dst_v, kWidth / 2 * kHeight);
+
+  MemRandomize(src_abgr, kWidth * kHeight * 4);
+
+  // JPEG
+  ARGBToI422Matrix(src_abgr, kWidth * 4, dst_y, kWidth, dst_u, kWidth / 2,
+                   dst_v, kWidth / 2, &kAbgrJPEGConstants, kWidth, kHeight);
+  // Verify against non-matrix version
+  align_buffer_page_end(ref_y, kWidth * kHeight);
+  align_buffer_page_end(ref_u, kWidth / 2 * kHeight);
+  align_buffer_page_end(ref_v, kWidth / 2 * kHeight);
+  ABGRToJ422(src_abgr, kWidth * 4, ref_y, kWidth, ref_u, kWidth / 2, ref_v,
+             kWidth / 2, kWidth, kHeight);
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    ASSERT_EQ(dst_y[i], ref_y[i]);
+  }
+  for (int i = 0; i < kWidth / 2 * kHeight; ++i) {
+    ASSERT_EQ(dst_u[i], ref_u[i]);
+    ASSERT_EQ(dst_v[i], ref_v[i]);
+  }
+
+  free_aligned_buffer_page_end(src_abgr);
+  free_aligned_buffer_page_end(dst_y);
+  free_aligned_buffer_page_end(dst_u);
+  free_aligned_buffer_page_end(dst_v);
+  free_aligned_buffer_page_end(ref_y);
+  free_aligned_buffer_page_end(ref_u);
+  free_aligned_buffer_page_end(ref_v);
+}
+
 TEST_F(LibYUVConvertTest, TestARGBToNV12Matrix) {
   const int kWidth = 16;
   const int kHeight = 16;
@@ -2290,18 +2327,17 @@ TEST_F(LibYUVConvertTest, TestARGBToI420Matrix) {
                    dst_v, kWidth / 2, &kArgbU2020Constants, kWidth, kHeight);
 
   // Reference BT.709 (limited range)
-  // Y = round(0.2126 * 219 / 255 * R + 0.7152 * 219 / 255 * G + 0.0722 * 219 /
-  // 255 * B + 16) Y = round(0.1826 * R + 0.6142 * G + 0.0620 * B + 16) 47 * 255
-  // + 157 * 255 + 16 * 255 + 4224 = 11985 + 40035 + 4080 + 4224 = 60324 60324 /
-  // 256 = 235.64 -> 235. Correct.
+  // Y = round(0.2126 * 219 / 255 * R + 0.7152 * 219 / 255 * G + 0.0722 * 219 / 255 * B + 16)
+  // Y = round(0.1826 * R + 0.6142 * G + 0.0620 * B + 16)
+  // 47 * 255 + 157 * 255 + 16 * 255 + 4224 = 11985 + 40035 + 4080 + 4224 = 60324
+  // 60324 / 256 = 235.64 -> 235. Correct.
 
-  for (int i = 0; i < kWidth * kHeight * 4; ++i)
-    src_argb[i] = 255;
+  for (int i = 0; i < kWidth * kHeight * 4; ++i) src_argb[i] = 255;
   ARGBToI420Matrix(src_argb, kWidth * 4, dst_y, kWidth, dst_u, kWidth / 2,
                    dst_v, kWidth / 2, &kArgbH709Constants, kWidth, kHeight);
-  ASSERT_EQ(dst_y[0], 235);
-  ASSERT_EQ(dst_u[0], 128);
-  ASSERT_EQ(dst_v[0], 128);
+  EXPECT_EQ(dst_y[0], 235);
+  EXPECT_EQ(dst_u[0], 128);
+  EXPECT_EQ(dst_v[0], 128);
 
   for (int i = 0; i < kWidth * kHeight * 4; i += 4) {
     src_argb[i + 0] = 0;    // B
@@ -2312,11 +2348,11 @@ TEST_F(LibYUVConvertTest, TestARGBToI420Matrix) {
   ARGBToI420Matrix(src_argb, kWidth * 4, dst_y, kWidth, dst_u, kWidth / 2,
                    dst_v, kWidth / 2, &kArgbH709Constants, kWidth, kHeight);
   // Y = 47 * 255 + 4224 = 11985 + 4224 = 16209. 16209 / 256 = 63.3 -> 63.
-  ASSERT_EQ(dst_y[0], 63);
+  EXPECT_EQ(dst_y[0], 63);
   // U = -26 * 255 + 32768 = -6630 + 32768 = 26138. 26138 / 256 = 102.1 -> 102.
-  ASSERT_EQ(dst_u[0], 102);
+  EXPECT_EQ(dst_u[0], 102);
   // V = 112 * 255 + 32768 = 28560 + 32768 = 61328. 61328 / 256 = 239.5 -> 239.
-  ASSERT_EQ(dst_v[0], 239);
+  EXPECT_EQ(dst_v[0], 239);
 
   free_aligned_buffer_page_end(src_argb);
   free_aligned_buffer_page_end(dst_y);
@@ -2427,132 +2463,6 @@ TEST_F(LibYUVConvertTest, TestARGBToI444Matrix) {
   free_aligned_buffer_page_end(ref_v);
 }
 
-template <typename ConvertToYUV, typename ConvertToARGB>
-static void TestRGBToI420(ConvertToYUV convert_to_yuv,
-                          ConvertToARGB convert_to_argb,
-                          int width,
-                          int height,
-                          int disable_cpu_flags,
-                          int benchmark_cpu_info) {
-  align_buffer_page_end(src_rgb, width * height * 4);
-  align_buffer_page_end(dst_y, width * height);
-  align_buffer_page_end(dst_u, (width + 1) / 2 * (height + 1) / 2);
-  align_buffer_page_end(dst_v, (width + 1) / 2 * (height + 1) / 2);
-
-  align_buffer_page_end(tmp_argb, width * height * 4);
-  align_buffer_page_end(ref_y, width * height);
-  align_buffer_page_end(ref_u, (width + 1) / 2 * (height + 1) / 2);
-  align_buffer_page_end(ref_v, (width + 1) / 2 * (height + 1) / 2);
-
-  MemRandomize(src_rgb, width * height * 4);
-
-  {
-    SCOPED_TRACE("C_Version");
-    MaskCpuFlags(disable_cpu_flags);
-
-    // Clear buffers
-    memset(dst_y, 0, width * height);
-    memset(dst_u, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(dst_v, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(ref_y, 0, width * height);
-    memset(ref_u, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(ref_v, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(tmp_argb, 0, width * height * 4);
-
-    int r1 =
-        convert_to_yuv(src_rgb, width * 4, dst_y, width, dst_u, (width + 1) / 2,
-                       dst_v, (width + 1) / 2, width, height);
-    ASSERT_EQ(r1, 0);
-
-    int r2 =
-        convert_to_argb(src_rgb, width * 4, tmp_argb, width * 4, width, height);
-    ASSERT_EQ(r2, 0);
-
-    int r3 = ARGBToI420(tmp_argb, width * 4, ref_y, width, ref_u,
-                        (width + 1) / 2, ref_v, (width + 1) / 2, width, height);
-    ASSERT_EQ(r3, 0);
-
-    for (int i = 0; i < width * height; ++i) {
-      ASSERT_EQ(dst_y[i], ref_y[i]);
-    }
-    for (int i = 0; i < (width + 1) / 2 * (height + 1) / 2; ++i) {
-      ASSERT_EQ(dst_u[i], ref_u[i]);
-      ASSERT_EQ(dst_v[i], ref_v[i]);
-    }
-  }
-
-  {
-    SCOPED_TRACE("SIMD_Version");
-    MaskCpuFlags(benchmark_cpu_info);
-
-    // Clear buffers
-    memset(dst_y, 0, width * height);
-    memset(dst_u, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(dst_v, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(ref_y, 0, width * height);
-    memset(ref_u, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(ref_v, 0, (width + 1) / 2 * (height + 1) / 2);
-    memset(tmp_argb, 0, width * height * 4);
-
-    int r1 =
-        convert_to_yuv(src_rgb, width * 4, dst_y, width, dst_u, (width + 1) / 2,
-                       dst_v, (width + 1) / 2, width, height);
-    ASSERT_EQ(r1, 0);
-
-    int r2 =
-        convert_to_argb(src_rgb, width * 4, tmp_argb, width * 4, width, height);
-    ASSERT_EQ(r2, 0);
-
-    int r3 = ARGBToI420(tmp_argb, width * 4, ref_y, width, ref_u,
-                        (width + 1) / 2, ref_v, (width + 1) / 2, width, height);
-    ASSERT_EQ(r3, 0);
-
-    for (int i = 0; i < width * height; ++i) {
-      ASSERT_EQ(dst_y[i], ref_y[i]);
-    }
-    for (int i = 0; i < (width + 1) / 2 * (height + 1) / 2; ++i) {
-      ASSERT_EQ(dst_u[i], ref_u[i]);
-      ASSERT_EQ(dst_v[i], ref_v[i]);
-    }
-  }
-
-  free_aligned_buffer_page_end(src_rgb);
-  free_aligned_buffer_page_end(dst_y);
-  free_aligned_buffer_page_end(dst_u);
-  free_aligned_buffer_page_end(dst_v);
-  free_aligned_buffer_page_end(tmp_argb);
-  free_aligned_buffer_page_end(ref_y);
-  free_aligned_buffer_page_end(ref_u);
-  free_aligned_buffer_page_end(ref_v);
-}
-
-TEST_F(LibYUVConvertTest, BGRAToI420_Check) {
-  TestRGBToI420(BGRAToI420, BGRAToARGB, 16, 16, disable_cpu_flags_,
-                benchmark_cpu_info_);
-  TestRGBToI420(BGRAToI420, BGRAToARGB, 17, 17, disable_cpu_flags_,
-                benchmark_cpu_info_);
-  TestRGBToI420(BGRAToI420, BGRAToARGB, 1280, 720, disable_cpu_flags_,
-                benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVConvertTest, RGBAToI420_Check) {
-  TestRGBToI420(RGBAToI420, RGBAToARGB, 16, 16, disable_cpu_flags_,
-                benchmark_cpu_info_);
-  TestRGBToI420(RGBAToI420, RGBAToARGB, 17, 17, disable_cpu_flags_,
-                benchmark_cpu_info_);
-  TestRGBToI420(RGBAToI420, RGBAToARGB, 1280, 720, disable_cpu_flags_,
-                benchmark_cpu_info_);
-}
-
-TEST_F(LibYUVConvertTest, ABGRToI420_Check) {
-  TestRGBToI420(ABGRToI420, ABGRToARGB, 16, 16, disable_cpu_flags_,
-                benchmark_cpu_info_);
-  TestRGBToI420(ABGRToI420, ABGRToARGB, 17, 17, disable_cpu_flags_,
-                benchmark_cpu_info_);
-  TestRGBToI420(ABGRToI420, ABGRToARGB, 1280, 720, disable_cpu_flags_,
-                benchmark_cpu_info_);
-}
-
 #endif  // !defined(LEAN_TESTS)
 
 }  // namespace libyuv
diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc
index b24f3e250..a70666740 100644
--- a/unit_test/cpu_test.cc
+++ b/unit_test/cpu_test.cc
@@ -48,7 +48,7 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
     printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n",
            reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1],
            cpu_info[2]);
-    ASSERT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+    EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
 
     // CPU Family and Model
     // 3:0 - Stepping
@@ -189,6 +189,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
     int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
     int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
     int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
+    int has_avx512bmm = TestCpuFlag(kCpuHasAVX512BMM);
     printf("Has X86 0x%x\n", has_x86);
     printf("Has SSE2 0x%x\n", has_sse2);
     printf("Has SSSE3 0x%x\n", has_ssse3);
@@ -211,6 +212,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
     printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
     printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
     printf("Has AMXINT8 0x%x\n", has_amxint8);
+    printf("Has AVX512BMM 0x%x\n", has_avx512bmm);
   }
 #endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
         // defined(_M_X64)
@@ -327,8 +329,8 @@ TEST_F(LibYUVBaseTest, DISABLED_TestLinuxArm) {
   if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
     printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n");
 
-    ASSERT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
-    ASSERT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
+    EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
+    EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
   } else {
     printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
   }
@@ -347,23 +349,23 @@ TEST_F(LibYUVBaseTest, DISABLED_TestLinuxArm) {
 #if defined(__linux__) && defined(__aarch64__)
 TEST_F(LibYUVBaseTest, TestLinuxAArch64) {
   // Values taken from a Cortex-A57 machine, only Neon available.
-  ASSERT_EQ(kCpuHasNEON, AArch64CpuCaps(0xffU, 0x0U));
+  EXPECT_EQ(kCpuHasNEON, AArch64CpuCaps(0xffU, 0x0U));
 
   // Values taken from a Google Pixel 7.
   int expected = kCpuHasNEON | kCpuHasNeonDotProd;
-  ASSERT_EQ(expected, AArch64CpuCaps(0x119fffU, 0x0U));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x119fffU, 0x0U));
 
   // Values taken from a Google Pixel 8.
   expected = kCpuHasNEON | kCpuHasNeonDotProd | kCpuHasNeonI8MM | kCpuHasSVE |
              kCpuHasSVE2;
-  ASSERT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f33fU));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f33fU));
 
   // Values taken from a Neoverse N2 machine.
-  ASSERT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f3ffU));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f3ffU));
 
   // Check for SME feature detection.
   expected |= kCpuHasSME;
-  ASSERT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x82f3ffU));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x82f3ffU));
 
   // TODO: Check for SME2 feature detection from Apple M4
 }
@@ -373,10 +375,10 @@ TEST_F(LibYUVBaseTest, DISABLED_TestLinuxRVV) {
   if (FileExists("../../unit_test/testdata/riscv64.txt")) {
     printf("Note: testing to load \"../../unit_test/testdata/riscv64.txt\"\n");
 
-    ASSERT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt"));
-    ASSERT_EQ(kCpuHasRVV,
+    EXPECT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt"));
+    EXPECT_EQ(kCpuHasRVV,
               RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv.txt"));
-    ASSERT_EQ(kCpuHasRVV | kCpuHasRVVZVFH,
+    EXPECT_EQ(kCpuHasRVV | kCpuHasRVVZVFH,
               RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv_zvfh.txt"));
   } else {
     printf(
@@ -410,15 +412,15 @@ TEST_F(LibYUVBaseTest, MAYBE_TestSetCpuFlags) {
   // Test setting different CPU configurations.
   int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized;
   SetCpuFlags(cpu_flags);
-  ASSERT_EQ(cpu_flags, TestCpuFlag(-1));
+  EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
 
   cpu_flags = kCpuHasX86 | kCpuInitialized;
   SetCpuFlags(cpu_flags);
-  ASSERT_EQ(cpu_flags, TestCpuFlag(-1));
+  EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
 
   // Test that setting 0 turns auto-init back on.
   SetCpuFlags(0);
-  ASSERT_EQ(original_cpu_flags, TestCpuFlag(-1));
+  EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1));
 
   // Restore the CPU flag mask.
   MaskCpuFlags(benchmark_cpu_info_);
diff --git a/unit_test/cpu_thread_test.cc b/unit_test/cpu_thread_test.cc
index 572074d73..b6c0fa066 100644
--- a/unit_test/cpu_thread_test.cc
+++ b/unit_test/cpu_thread_test.cc
@@ -51,10 +51,10 @@ TEST(LibYUVCpuThreadTest, TestCpuFlagMultipleThreads) {
   ret = pthread_create(&thread2, nullptr, ThreadMain, &cpu_flags2);
   ASSERT_EQ(ret, 0);
   ret = pthread_join(thread1, nullptr);
-  ASSERT_EQ(ret, 0);
+  EXPECT_EQ(ret, 0);
   ret = pthread_join(thread2, nullptr);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(cpu_flags1, cpu_flags2);
+  EXPECT_EQ(ret, 0);
+  EXPECT_EQ(cpu_flags1, cpu_flags2);
 #else
   printf("pthread unavailable; Test skipped.");
 #endif  // LIBYUV_HAVE_PTHREAD
diff --git a/unit_test/math_test.cc b/unit_test/math_test.cc
index 4767f8b46..a1544c122 100644
--- a/unit_test/math_test.cc
+++ b/unit_test/math_test.cc
@@ -30,44 +30,44 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
   int result_opt[1280];
   int result_c[1280];
 
-  ASSERT_EQ(0x10000, libyuv::FixedDiv(1, 1));
-  ASSERT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1));
+  EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
   // TODO(fbarchard): Avoid the following that throw exceptions.
-  // ASSERT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
-  // ASSERT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));
+  // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
+  // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));
 
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
-  ASSERT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
-  ASSERT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
-  ASSERT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
-  ASSERT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
-  ASSERT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
-  ASSERT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
-  ASSERT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
-  ASSERT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
-  ASSERT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
-  ASSERT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));
-  ASSERT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
-  ASSERT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
-  ASSERT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
-  ASSERT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
-  ASSERT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
-  ASSERT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
+  EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
+  EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
+  EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
+  EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
+  EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
+  EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
+  EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
+  EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
+  EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
+  EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
 
   for (int i = 1; i < 4100; ++i) {
-    ASSERT_EQ(0x10000, libyuv::FixedDiv(i, i));
-    ASSERT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
-    ASSERT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
-    ASSERT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
-    ASSERT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
-    ASSERT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
+    EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i));
+    EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
+    EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
+    EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
+    EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
+    EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
   }
-  ASSERT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
 
   MemRandomize(reinterpret_cast<uint8_t*>(&num[0]), sizeof(num));
   MemRandomize(reinterpret_cast<uint8_t*>(&div[0]), sizeof(div));
@@ -84,7 +84,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
   }
   for (int j = 0; j < 1280; ++j) {
     result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
-    ASSERT_NEAR(result_c[j], result_opt[j], 1);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
   }
 }
 
@@ -118,7 +118,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
   }
   for (int j = 0; j < 1280; ++j) {
     result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
-    ASSERT_NEAR(result_c[j], result_opt[j], 1);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
   }
 }
 
@@ -152,7 +152,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
   }
   for (int j = 0; j < 1280; ++j) {
     result_c[j] = libyuv::FixedDiv1_C(num[j], div[j]);
-    ASSERT_NEAR(result_c[j], result_opt[j], 1);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
   }
 }
 #endif  // ENABLE_ROW_TESTS
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 7eba494b7..2e26b4cf6 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -29,7 +29,11 @@
 #include "libyuv/row.h" /* For ScaleSumSamples_Neon */
 #endif
 
+#if defined(LIBYUV_BIT_EXACT)
 #define EXPECTED_UNATTENUATE_DIFF 0
+#else
+#define EXPECTED_UNATTENUATE_DIFF 2
+#endif
 
 namespace libyuv {
 
@@ -64,48 +68,48 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
   orig_pixels[4 * 4 + 3] = 255u;
 
   ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 5, 1);
-  ASSERT_EQ(255u, unatten_pixels[0 * 4 + 0]);
-  ASSERT_EQ(255u, unatten_pixels[0 * 4 + 1]);
-  ASSERT_EQ(254u, unatten_pixels[0 * 4 + 2]);
-  ASSERT_EQ(128u, unatten_pixels[0 * 4 + 3]);
-  ASSERT_EQ(0u, unatten_pixels[1 * 4 + 0]);
-  ASSERT_EQ(0u, unatten_pixels[1 * 4 + 1]);
-  ASSERT_EQ(0u, unatten_pixels[1 * 4 + 2]);
-  ASSERT_EQ(0u, unatten_pixels[1 * 4 + 3]);
-  ASSERT_EQ(32u, unatten_pixels[2 * 4 + 0]);
-  ASSERT_EQ(128u, unatten_pixels[2 * 4 + 1]);
-  ASSERT_EQ(255u, unatten_pixels[2 * 4 + 2]);
-  ASSERT_EQ(128u, unatten_pixels[2 * 4 + 3]);
-  ASSERT_EQ(16u, unatten_pixels[3 * 4 + 0]);
-  ASSERT_EQ(64u, unatten_pixels[3 * 4 + 1]);
-  ASSERT_EQ(192u, unatten_pixels[3 * 4 + 2]);
-  ASSERT_EQ(255u, unatten_pixels[3 * 4 + 3]);
-  ASSERT_EQ(255u, unatten_pixels[4 * 4 + 0]);
-  ASSERT_EQ(255u, unatten_pixels[4 * 4 + 1]);
-  ASSERT_EQ(255u, unatten_pixels[4 * 4 + 2]);
-  ASSERT_EQ(255u, unatten_pixels[4 * 4 + 3]);
+  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
+  EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
+  EXPECT_EQ(32u, unatten_pixels[2 * 4 + 0]);
+  EXPECT_EQ(128u, unatten_pixels[2 * 4 + 1]);
+  EXPECT_EQ(255u, unatten_pixels[2 * 4 + 2]);
+  EXPECT_EQ(128u, unatten_pixels[2 * 4 + 3]);
+  EXPECT_EQ(16u, unatten_pixels[3 * 4 + 0]);
+  EXPECT_EQ(64u, unatten_pixels[3 * 4 + 1]);
+  EXPECT_EQ(192u, unatten_pixels[3 * 4 + 2]);
+  EXPECT_EQ(255u, unatten_pixels[3 * 4 + 3]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 0]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 1]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 2]);
+  EXPECT_EQ(255u, unatten_pixels[4 * 4 + 3]);
 
   ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 5, 1);
-  ASSERT_EQ(100u, atten_pixels[0 * 4 + 0]);
-  ASSERT_EQ(65u, atten_pixels[0 * 4 + 1]);
-  ASSERT_EQ(64u, atten_pixels[0 * 4 + 2]);
-  ASSERT_EQ(128u, atten_pixels[0 * 4 + 3]);
-  ASSERT_EQ(0u, atten_pixels[1 * 4 + 0]);
-  ASSERT_EQ(0u, atten_pixels[1 * 4 + 1]);
-  ASSERT_EQ(0u, atten_pixels[1 * 4 + 2]);
-  ASSERT_EQ(0u, atten_pixels[1 * 4 + 3]);
-  ASSERT_EQ(8u, atten_pixels[2 * 4 + 0]);
-  ASSERT_EQ(32u, atten_pixels[2 * 4 + 1]);
-  ASSERT_EQ(96u, atten_pixels[2 * 4 + 2]);
-  ASSERT_EQ(128u, atten_pixels[2 * 4 + 3]);
-  ASSERT_EQ(16u, atten_pixels[3 * 4 + 0]);
-  ASSERT_EQ(64u, atten_pixels[3 * 4 + 1]);
-  ASSERT_EQ(192u, atten_pixels[3 * 4 + 2]);
-  ASSERT_EQ(255u, atten_pixels[3 * 4 + 3]);
-  ASSERT_EQ(255u, atten_pixels[4 * 4 + 0]);
-  ASSERT_EQ(255u, atten_pixels[4 * 4 + 1]);
-  ASSERT_EQ(255u, atten_pixels[4 * 4 + 2]);
-  ASSERT_EQ(255u, atten_pixels[4 * 4 + 3]);
+  EXPECT_EQ(100u, atten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(65u, atten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(64u, atten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(128u, atten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 0]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 1]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 2]);
+  EXPECT_EQ(0u, atten_pixels[1 * 4 + 3]);
+  EXPECT_EQ(8u, atten_pixels[2 * 4 + 0]);
+  EXPECT_EQ(32u, atten_pixels[2 * 4 + 1]);
+  EXPECT_EQ(96u, atten_pixels[2 * 4 + 2]);
+  EXPECT_EQ(128u, atten_pixels[2 * 4 + 3]);
+  EXPECT_EQ(16u, atten_pixels[3 * 4 + 0]);
+  EXPECT_EQ(64u, atten_pixels[3 * 4 + 1]);
+  EXPECT_EQ(192u, atten_pixels[3 * 4 + 2]);
+  EXPECT_EQ(255u, atten_pixels[3 * 4 + 3]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 0]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 1]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 2]);
+  EXPECT_EQ(255u, atten_pixels[4 * 4 + 3]);
 
   // test 255
   for (int i = 0; i < 256; ++i) {
@@ -116,10 +120,10 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
   }
   ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 256, 1);
   for (int i = 0; i < 256; ++i) {
-    ASSERT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]);
-    ASSERT_EQ(0, atten_pixels[i * 4 + 1]);
-    ASSERT_EQ(0, atten_pixels[i * 4 + 2]);
-    ASSERT_EQ(255, atten_pixels[i * 4 + 3]);
+    EXPECT_EQ(orig_pixels[i * 4 + 0], atten_pixels[i * 4 + 0]);
+    EXPECT_EQ(0, atten_pixels[i * 4 + 1]);
+    EXPECT_EQ(0, atten_pixels[i * 4 + 2]);
+    EXPECT_EQ(255, atten_pixels[i * 4 + 3]);
   }
 
   for (int i = 0; i < 1280; ++i) {
@@ -134,24 +138,24 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
     ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
   }
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1);
-    ASSERT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1);
-    ASSERT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1);
-    ASSERT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1);
+    EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 1);
+    EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 1);
+    EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 1);
+    EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 1);
   }
   // Make sure transparent, 50% and opaque are fully accurate.
-  ASSERT_EQ(0, atten_pixels[0 * 4 + 0]);
-  ASSERT_EQ(0, atten_pixels[0 * 4 + 1]);
-  ASSERT_EQ(0, atten_pixels[0 * 4 + 2]);
-  ASSERT_EQ(0, atten_pixels[0 * 4 + 3]);
-  ASSERT_EQ(64, atten_pixels[128 * 4 + 0]);
-  ASSERT_EQ(32, atten_pixels[128 * 4 + 1]);
-  ASSERT_EQ(21, atten_pixels[128 * 4 + 2]);
-  ASSERT_EQ(128, atten_pixels[128 * 4 + 3]);
-  ASSERT_EQ(255, atten_pixels[255 * 4 + 0]);
-  ASSERT_EQ(127, atten_pixels[255 * 4 + 1]);
-  ASSERT_EQ(85, atten_pixels[255 * 4 + 2]);
-  ASSERT_EQ(255, atten_pixels[255 * 4 + 3]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 1]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 2]);
+  EXPECT_EQ(0, atten_pixels[0 * 4 + 3]);
+  EXPECT_EQ(64, atten_pixels[128 * 4 + 0]);
+  EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
+  EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
+  EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
+  EXPECT_EQ(255, atten_pixels[255 * 4 + 0]);
+  EXPECT_EQ(127, atten_pixels[255 * 4 + 1]);
+  EXPECT_EQ(85, atten_pixels[255 * 4 + 2]);
+  EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
 
   free_aligned_buffer_page_end(atten2_pixels);
   free_aligned_buffer_page_end(unatten_pixels);
@@ -207,28 +211,28 @@ TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) {
                                 benchmark_iterations_, disable_cpu_flags_,
                                 benchmark_cpu_info_, +1, 0);
 
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_EQ(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) {
   int max_diff =
       TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
                      disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_EQ(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) {
   int max_diff =
       TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
                      disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_EQ(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) {
   int max_diff =
       TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_,
                      disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_EQ(max_diff, 0);
 }
 
 static int TestUnattenuateI(int width,
@@ -280,28 +284,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) {
   int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) {
   int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 1);
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) {
   int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, -1, 0);
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) {
   int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(max_diff, 0);
+  EXPECT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF);
 }
 
 TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
@@ -322,10 +326,10 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
 
   for (int y = 0; y < 16; ++y) {
     for (int x = 0; x < 16; ++x) {
-      ASSERT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
-      ASSERT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
-      ASSERT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
-      ASSERT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
+      EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
+      EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
+      EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
+      EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
     }
   }
 }
@@ -367,30 +371,30 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
   orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
-  ASSERT_NEAR(29u, orig_pixels[0][0], 1);
-  ASSERT_NEAR(29u, orig_pixels[0][1], 1);
-  ASSERT_NEAR(29u, orig_pixels[0][2], 1);
-  ASSERT_EQ(128u, orig_pixels[0][3]);
-  ASSERT_EQ(149u, orig_pixels[1][0]);
-  ASSERT_EQ(149u, orig_pixels[1][1]);
-  ASSERT_EQ(149u, orig_pixels[1][2]);
-  ASSERT_EQ(0u, orig_pixels[1][3]);
-  ASSERT_NEAR(77u, orig_pixels[2][0], 1);
-  ASSERT_NEAR(77u, orig_pixels[2][1], 1);
-  ASSERT_NEAR(77u, orig_pixels[2][2], 1);
-  ASSERT_EQ(255u, orig_pixels[2][3]);
-  ASSERT_EQ(0u, orig_pixels[3][0]);
-  ASSERT_EQ(0u, orig_pixels[3][1]);
-  ASSERT_EQ(0u, orig_pixels[3][2]);
-  ASSERT_EQ(255u, orig_pixels[3][3]);
-  ASSERT_EQ(255u, orig_pixels[4][0]);
-  ASSERT_EQ(255u, orig_pixels[4][1]);
-  ASSERT_EQ(255u, orig_pixels[4][2]);
-  ASSERT_EQ(255u, orig_pixels[4][3]);
-  ASSERT_NEAR(97u, orig_pixels[5][0], 1);
-  ASSERT_NEAR(97u, orig_pixels[5][1], 1);
-  ASSERT_NEAR(97u, orig_pixels[5][2], 1);
-  ASSERT_EQ(224u, orig_pixels[5][3]);
+  EXPECT_NEAR(29u, orig_pixels[0][0], 1);
+  EXPECT_NEAR(29u, orig_pixels[0][1], 1);
+  EXPECT_NEAR(29u, orig_pixels[0][2], 1);
+  EXPECT_EQ(128u, orig_pixels[0][3]);
+  EXPECT_EQ(149u, orig_pixels[1][0]);
+  EXPECT_EQ(149u, orig_pixels[1][1]);
+  EXPECT_EQ(149u, orig_pixels[1][2]);
+  EXPECT_EQ(0u, orig_pixels[1][3]);
+  EXPECT_NEAR(77u, orig_pixels[2][0], 1);
+  EXPECT_NEAR(77u, orig_pixels[2][1], 1);
+  EXPECT_NEAR(77u, orig_pixels[2][2], 1);
+  EXPECT_EQ(255u, orig_pixels[2][3]);
+  EXPECT_EQ(0u, orig_pixels[3][0]);
+  EXPECT_EQ(0u, orig_pixels[3][1]);
+  EXPECT_EQ(0u, orig_pixels[3][2]);
+  EXPECT_EQ(255u, orig_pixels[3][3]);
+  EXPECT_EQ(255u, orig_pixels[4][0]);
+  EXPECT_EQ(255u, orig_pixels[4][1]);
+  EXPECT_EQ(255u, orig_pixels[4][2]);
+  EXPECT_EQ(255u, orig_pixels[4][3]);
+  EXPECT_NEAR(97u, orig_pixels[5][0], 1);
+  EXPECT_NEAR(97u, orig_pixels[5][1], 1);
+  EXPECT_NEAR(97u, orig_pixels[5][2], 1);
+  EXPECT_EQ(224u, orig_pixels[5][3]);
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
@@ -439,30 +443,30 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
   orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
-  ASSERT_NEAR(30u, gray_pixels[0][0], 1);
-  ASSERT_NEAR(30u, gray_pixels[0][1], 1);
-  ASSERT_NEAR(30u, gray_pixels[0][2], 1);
-  ASSERT_NEAR(128u, gray_pixels[0][3], 1);
-  ASSERT_NEAR(149u, gray_pixels[1][0], 1);
-  ASSERT_NEAR(149u, gray_pixels[1][1], 1);
-  ASSERT_NEAR(149u, gray_pixels[1][2], 1);
-  ASSERT_NEAR(0u, gray_pixels[1][3], 1);
-  ASSERT_NEAR(76u, gray_pixels[2][0], 1);
-  ASSERT_NEAR(76u, gray_pixels[2][1], 1);
-  ASSERT_NEAR(76u, gray_pixels[2][2], 1);
-  ASSERT_NEAR(255u, gray_pixels[2][3], 1);
-  ASSERT_NEAR(0u, gray_pixels[3][0], 1);
-  ASSERT_NEAR(0u, gray_pixels[3][1], 1);
-  ASSERT_NEAR(0u, gray_pixels[3][2], 1);
-  ASSERT_NEAR(255u, gray_pixels[3][3], 1);
-  ASSERT_NEAR(255u, gray_pixels[4][0], 1);
-  ASSERT_NEAR(255u, gray_pixels[4][1], 1);
-  ASSERT_NEAR(255u, gray_pixels[4][2], 1);
-  ASSERT_NEAR(255u, gray_pixels[4][3], 1);
-  ASSERT_NEAR(96u, gray_pixels[5][0], 1);
-  ASSERT_NEAR(96u, gray_pixels[5][1], 1);
-  ASSERT_NEAR(96u, gray_pixels[5][2], 1);
-  ASSERT_NEAR(224u, gray_pixels[5][3], 1);
+  EXPECT_NEAR(30u, gray_pixels[0][0], 1);
+  EXPECT_NEAR(30u, gray_pixels[0][1], 1);
+  EXPECT_NEAR(30u, gray_pixels[0][2], 1);
+  EXPECT_NEAR(128u, gray_pixels[0][3], 1);
+  EXPECT_NEAR(149u, gray_pixels[1][0], 1);
+  EXPECT_NEAR(149u, gray_pixels[1][1], 1);
+  EXPECT_NEAR(149u, gray_pixels[1][2], 1);
+  EXPECT_NEAR(0u, gray_pixels[1][3], 1);
+  EXPECT_NEAR(76u, gray_pixels[2][0], 1);
+  EXPECT_NEAR(76u, gray_pixels[2][1], 1);
+  EXPECT_NEAR(76u, gray_pixels[2][2], 1);
+  EXPECT_NEAR(255u, gray_pixels[2][3], 1);
+  EXPECT_NEAR(0u, gray_pixels[3][0], 1);
+  EXPECT_NEAR(0u, gray_pixels[3][1], 1);
+  EXPECT_NEAR(0u, gray_pixels[3][2], 1);
+  EXPECT_NEAR(255u, gray_pixels[3][3], 1);
+  EXPECT_NEAR(255u, gray_pixels[4][0], 1);
+  EXPECT_NEAR(255u, gray_pixels[4][1], 1);
+  EXPECT_NEAR(255u, gray_pixels[4][2], 1);
+  EXPECT_NEAR(255u, gray_pixels[4][3], 1);
+  EXPECT_NEAR(96u, gray_pixels[5][0], 1);
+  EXPECT_NEAR(96u, gray_pixels[5][1], 1);
+  EXPECT_NEAR(96u, gray_pixels[5][2], 1);
+  EXPECT_NEAR(224u, gray_pixels[5][3], 1);
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
@@ -481,10 +485,10 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
   }
   ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
   for (int i = 0; i < 256; ++i) {
-    ASSERT_EQ(i, orig_pixels[i][0]);
-    ASSERT_EQ(i, orig_pixels[i][1]);
-    ASSERT_EQ(i, orig_pixels[i][2]);
-    ASSERT_EQ(i, orig_pixels[i][3]);
+    EXPECT_EQ(i, orig_pixels[i][0]);
+    EXPECT_EQ(i, orig_pixels[i][1]);
+    EXPECT_EQ(i, orig_pixels[i][2]);
+    EXPECT_EQ(i, orig_pixels[i][3]);
   }
 }
 
@@ -524,30 +528,30 @@ TEST_F(LibYUVPlanarTest, TestARGBSepia) {
   orig_pixels[5][3] = 224u;
   // Do 16 to test asm version.
   ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1);
-  ASSERT_EQ(33u, orig_pixels[0][0]);
-  ASSERT_EQ(43u, orig_pixels[0][1]);
-  ASSERT_EQ(47u, orig_pixels[0][2]);
-  ASSERT_EQ(128u, orig_pixels[0][3]);
-  ASSERT_EQ(135u, orig_pixels[1][0]);
-  ASSERT_EQ(175u, orig_pixels[1][1]);
-  ASSERT_EQ(195u, orig_pixels[1][2]);
-  ASSERT_EQ(0u, orig_pixels[1][3]);
-  ASSERT_EQ(69u, orig_pixels[2][0]);
-  ASSERT_EQ(89u, orig_pixels[2][1]);
-  ASSERT_EQ(99u, orig_pixels[2][2]);
-  ASSERT_EQ(255u, orig_pixels[2][3]);
-  ASSERT_EQ(0u, orig_pixels[3][0]);
-  ASSERT_EQ(0u, orig_pixels[3][1]);
-  ASSERT_EQ(0u, orig_pixels[3][2]);
-  ASSERT_EQ(255u, orig_pixels[3][3]);
-  ASSERT_EQ(239u, orig_pixels[4][0]);
-  ASSERT_EQ(255u, orig_pixels[4][1]);
-  ASSERT_EQ(255u, orig_pixels[4][2]);
-  ASSERT_EQ(255u, orig_pixels[4][3]);
-  ASSERT_EQ(88u, orig_pixels[5][0]);
-  ASSERT_EQ(114u, orig_pixels[5][1]);
-  ASSERT_EQ(127u, orig_pixels[5][2]);
-  ASSERT_EQ(224u, orig_pixels[5][3]);
+  EXPECT_EQ(33u, orig_pixels[0][0]);
+  EXPECT_EQ(43u, orig_pixels[0][1]);
+  EXPECT_EQ(47u, orig_pixels[0][2]);
+  EXPECT_EQ(128u, orig_pixels[0][3]);
+  EXPECT_EQ(135u, orig_pixels[1][0]);
+  EXPECT_EQ(175u, orig_pixels[1][1]);
+  EXPECT_EQ(195u, orig_pixels[1][2]);
+  EXPECT_EQ(0u, orig_pixels[1][3]);
+  EXPECT_EQ(69u, orig_pixels[2][0]);
+  EXPECT_EQ(89u, orig_pixels[2][1]);
+  EXPECT_EQ(99u, orig_pixels[2][2]);
+  EXPECT_EQ(255u, orig_pixels[2][3]);
+  EXPECT_EQ(0u, orig_pixels[3][0]);
+  EXPECT_EQ(0u, orig_pixels[3][1]);
+  EXPECT_EQ(0u, orig_pixels[3][2]);
+  EXPECT_EQ(255u, orig_pixels[3][3]);
+  EXPECT_EQ(239u, orig_pixels[4][0]);
+  EXPECT_EQ(255u, orig_pixels[4][1]);
+  EXPECT_EQ(255u, orig_pixels[4][2]);
+  EXPECT_EQ(255u, orig_pixels[4][3]);
+  EXPECT_EQ(88u, orig_pixels[5][0]);
+  EXPECT_EQ(114u, orig_pixels[5][1]);
+  EXPECT_EQ(127u, orig_pixels[5][2]);
+  EXPECT_EQ(224u, orig_pixels[5][3]);
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
@@ -595,22 +599,22 @@ TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) {
   // Do 16 to test asm version.
   ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
                   &kRGBToSepia[0], 16, 1);
-  ASSERT_EQ(31u, dst_pixels_opt[0][0]);
-  ASSERT_EQ(43u, dst_pixels_opt[0][1]);
-  ASSERT_EQ(47u, dst_pixels_opt[0][2]);
-  ASSERT_EQ(128u, dst_pixels_opt[0][3]);
-  ASSERT_EQ(135u, dst_pixels_opt[1][0]);
-  ASSERT_EQ(175u, dst_pixels_opt[1][1]);
-  ASSERT_EQ(195u, dst_pixels_opt[1][2]);
-  ASSERT_EQ(0u, dst_pixels_opt[1][3]);
-  ASSERT_EQ(67u, dst_pixels_opt[2][0]);
-  ASSERT_EQ(87u, dst_pixels_opt[2][1]);
-  ASSERT_EQ(99u, dst_pixels_opt[2][2]);
-  ASSERT_EQ(255u, dst_pixels_opt[2][3]);
-  ASSERT_EQ(87u, dst_pixels_opt[3][0]);
-  ASSERT_EQ(112u, dst_pixels_opt[3][1]);
-  ASSERT_EQ(127u, dst_pixels_opt[3][2]);
-  ASSERT_EQ(224u, dst_pixels_opt[3][3]);
+  EXPECT_EQ(31u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(43u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(47u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(135u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(175u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(195u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(67u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(87u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(99u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(87u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(112u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(127u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[3][3]);
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
@@ -629,10 +633,10 @@ TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) {
   }
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
-    ASSERT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
-    ASSERT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
-    ASSERT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
   }
 }
 
@@ -668,22 +672,22 @@ TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) {
   orig_pixels[3][3] = 224u;
   // Do 16 to test asm version.
   RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1);
-  ASSERT_EQ(31u, orig_pixels[0][0]);
-  ASSERT_EQ(43u, orig_pixels[0][1]);
-  ASSERT_EQ(47u, orig_pixels[0][2]);
-  ASSERT_EQ(128u, orig_pixels[0][3]);
-  ASSERT_EQ(135u, orig_pixels[1][0]);
-  ASSERT_EQ(175u, orig_pixels[1][1]);
-  ASSERT_EQ(195u, orig_pixels[1][2]);
-  ASSERT_EQ(0u, orig_pixels[1][3]);
-  ASSERT_EQ(67u, orig_pixels[2][0]);
-  ASSERT_EQ(87u, orig_pixels[2][1]);
-  ASSERT_EQ(99u, orig_pixels[2][2]);
-  ASSERT_EQ(255u, orig_pixels[2][3]);
-  ASSERT_EQ(87u, orig_pixels[3][0]);
-  ASSERT_EQ(112u, orig_pixels[3][1]);
-  ASSERT_EQ(127u, orig_pixels[3][2]);
-  ASSERT_EQ(224u, orig_pixels[3][3]);
+  EXPECT_EQ(31u, orig_pixels[0][0]);
+  EXPECT_EQ(43u, orig_pixels[0][1]);
+  EXPECT_EQ(47u, orig_pixels[0][2]);
+  EXPECT_EQ(128u, orig_pixels[0][3]);
+  EXPECT_EQ(135u, orig_pixels[1][0]);
+  EXPECT_EQ(175u, orig_pixels[1][1]);
+  EXPECT_EQ(195u, orig_pixels[1][2]);
+  EXPECT_EQ(0u, orig_pixels[1][3]);
+  EXPECT_EQ(67u, orig_pixels[2][0]);
+  EXPECT_EQ(87u, orig_pixels[2][1]);
+  EXPECT_EQ(99u, orig_pixels[2][2]);
+  EXPECT_EQ(255u, orig_pixels[2][3]);
+  EXPECT_EQ(87u, orig_pixels[3][0]);
+  EXPECT_EQ(112u, orig_pixels[3][1]);
+  EXPECT_EQ(127u, orig_pixels[3][2]);
+  EXPECT_EQ(224u, orig_pixels[3][3]);
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
@@ -723,22 +727,22 @@ TEST_F(LibYUVPlanarTest, TestARGBColorTable) {
   orig_pixels[3][3] = 3u;
   // Do 16 to test asm version.
   ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
-  ASSERT_EQ(1u, orig_pixels[0][0]);
-  ASSERT_EQ(2u, orig_pixels[0][1]);
-  ASSERT_EQ(3u, orig_pixels[0][2]);
-  ASSERT_EQ(4u, orig_pixels[0][3]);
-  ASSERT_EQ(5u, orig_pixels[1][0]);
-  ASSERT_EQ(6u, orig_pixels[1][1]);
-  ASSERT_EQ(7u, orig_pixels[1][2]);
-  ASSERT_EQ(8u, orig_pixels[1][3]);
-  ASSERT_EQ(9u, orig_pixels[2][0]);
-  ASSERT_EQ(10u, orig_pixels[2][1]);
-  ASSERT_EQ(11u, orig_pixels[2][2]);
-  ASSERT_EQ(12u, orig_pixels[2][3]);
-  ASSERT_EQ(1u, orig_pixels[3][0]);
-  ASSERT_EQ(6u, orig_pixels[3][1]);
-  ASSERT_EQ(11u, orig_pixels[3][2]);
-  ASSERT_EQ(16u, orig_pixels[3][3]);
+  EXPECT_EQ(1u, orig_pixels[0][0]);
+  EXPECT_EQ(2u, orig_pixels[0][1]);
+  EXPECT_EQ(3u, orig_pixels[0][2]);
+  EXPECT_EQ(4u, orig_pixels[0][3]);
+  EXPECT_EQ(5u, orig_pixels[1][0]);
+  EXPECT_EQ(6u, orig_pixels[1][1]);
+  EXPECT_EQ(7u, orig_pixels[1][2]);
+  EXPECT_EQ(8u, orig_pixels[1][3]);
+  EXPECT_EQ(9u, orig_pixels[2][0]);
+  EXPECT_EQ(10u, orig_pixels[2][1]);
+  EXPECT_EQ(11u, orig_pixels[2][2]);
+  EXPECT_EQ(12u, orig_pixels[2][3]);
+  EXPECT_EQ(1u, orig_pixels[3][0]);
+  EXPECT_EQ(6u, orig_pixels[3][1]);
+  EXPECT_EQ(11u, orig_pixels[3][2]);
+  EXPECT_EQ(16u, orig_pixels[3][3]);
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
@@ -779,22 +783,22 @@ TEST_F(LibYUVPlanarTest, TestRGBColorTable) {
   orig_pixels[3][3] = 3u;
   // Do 16 to test asm version.
   RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
-  ASSERT_EQ(1u, orig_pixels[0][0]);
-  ASSERT_EQ(2u, orig_pixels[0][1]);
-  ASSERT_EQ(3u, orig_pixels[0][2]);
-  ASSERT_EQ(0u, orig_pixels[0][3]);  // Alpha unchanged.
-  ASSERT_EQ(5u, orig_pixels[1][0]);
-  ASSERT_EQ(6u, orig_pixels[1][1]);
-  ASSERT_EQ(7u, orig_pixels[1][2]);
-  ASSERT_EQ(1u, orig_pixels[1][3]);  // Alpha unchanged.
-  ASSERT_EQ(9u, orig_pixels[2][0]);
-  ASSERT_EQ(10u, orig_pixels[2][1]);
-  ASSERT_EQ(11u, orig_pixels[2][2]);
-  ASSERT_EQ(2u, orig_pixels[2][3]);  // Alpha unchanged.
-  ASSERT_EQ(1u, orig_pixels[3][0]);
-  ASSERT_EQ(6u, orig_pixels[3][1]);
-  ASSERT_EQ(11u, orig_pixels[3][2]);
-  ASSERT_EQ(3u, orig_pixels[3][3]);  // Alpha unchanged.
+  EXPECT_EQ(1u, orig_pixels[0][0]);
+  EXPECT_EQ(2u, orig_pixels[0][1]);
+  EXPECT_EQ(3u, orig_pixels[0][2]);
+  EXPECT_EQ(0u, orig_pixels[0][3]);  // Alpha unchanged.
+  EXPECT_EQ(5u, orig_pixels[1][0]);
+  EXPECT_EQ(6u, orig_pixels[1][1]);
+  EXPECT_EQ(7u, orig_pixels[1][2]);
+  EXPECT_EQ(1u, orig_pixels[1][3]);  // Alpha unchanged.
+  EXPECT_EQ(9u, orig_pixels[2][0]);
+  EXPECT_EQ(10u, orig_pixels[2][1]);
+  EXPECT_EQ(11u, orig_pixels[2][2]);
+  EXPECT_EQ(2u, orig_pixels[2][3]);  // Alpha unchanged.
+  EXPECT_EQ(1u, orig_pixels[3][0]);
+  EXPECT_EQ(6u, orig_pixels[3][1]);
+  EXPECT_EQ(11u, orig_pixels[3][2]);
+  EXPECT_EQ(3u, orig_pixels[3][3]);  // Alpha unchanged.
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
@@ -820,10 +824,10 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
                1280, 1);
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]);
-    ASSERT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]);
-    ASSERT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]);
-    ASSERT_EQ(i & 255, orig_pixels[i][3]);
+    EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]);
+    EXPECT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]);
+    EXPECT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]);
+    EXPECT_EQ(i & 255, orig_pixels[i][3]);
   }
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
     ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0,
@@ -848,7 +852,7 @@ TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
                benchmark_width_ * 4, benchmark_width_, benchmark_height_);
   }
   for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
   free_aligned_buffer_page_end(src_pixels);
   free_aligned_buffer_page_end(dst_pixels_opt);
@@ -871,7 +875,7 @@ TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
                 benchmark_width_, benchmark_height_);
   }
   for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
   free_aligned_buffer_page_end(src_pixels);
   free_aligned_buffer_page_end(dst_pixels_opt);
@@ -895,7 +899,7 @@ TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
                   benchmark_width_ * 2, benchmark_width_, benchmark_height_);
   }
   for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
   free_aligned_buffer_page_end(src_pixels);
   free_aligned_buffer_page_end(dst_pixels_opt);
@@ -925,34 +929,34 @@ TEST_F(LibYUVPlanarTest, TestShade) {
   orig_pixels[3][3] = 0u;
   // Do 8 pixels to allow opt version to be used.
   ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80ffffff);
-  ASSERT_EQ(10u, shade_pixels[0][0]);
-  ASSERT_EQ(20u, shade_pixels[0][1]);
-  ASSERT_EQ(40u, shade_pixels[0][2]);
-  ASSERT_EQ(40u, shade_pixels[0][3]);
-  ASSERT_EQ(0u, shade_pixels[1][0]);
-  ASSERT_EQ(0u, shade_pixels[1][1]);
-  ASSERT_EQ(0u, shade_pixels[1][2]);
-  ASSERT_EQ(128u, shade_pixels[1][3]);
-  ASSERT_EQ(0u, shade_pixels[2][0]);
-  ASSERT_EQ(0u, shade_pixels[2][1]);
-  ASSERT_EQ(0u, shade_pixels[2][2]);
-  ASSERT_EQ(0u, shade_pixels[2][3]);
-  ASSERT_EQ(0u, shade_pixels[3][0]);
-  ASSERT_EQ(0u, shade_pixels[3][1]);
-  ASSERT_EQ(0u, shade_pixels[3][2]);
-  ASSERT_EQ(0u, shade_pixels[3][3]);
+  EXPECT_EQ(10u, shade_pixels[0][0]);
+  EXPECT_EQ(20u, shade_pixels[0][1]);
+  EXPECT_EQ(40u, shade_pixels[0][2]);
+  EXPECT_EQ(40u, shade_pixels[0][3]);
+  EXPECT_EQ(0u, shade_pixels[1][0]);
+  EXPECT_EQ(0u, shade_pixels[1][1]);
+  EXPECT_EQ(0u, shade_pixels[1][2]);
+  EXPECT_EQ(128u, shade_pixels[1][3]);
+  EXPECT_EQ(0u, shade_pixels[2][0]);
+  EXPECT_EQ(0u, shade_pixels[2][1]);
+  EXPECT_EQ(0u, shade_pixels[2][2]);
+  EXPECT_EQ(0u, shade_pixels[2][3]);
+  EXPECT_EQ(0u, shade_pixels[3][0]);
+  EXPECT_EQ(0u, shade_pixels[3][1]);
+  EXPECT_EQ(0u, shade_pixels[3][2]);
+  EXPECT_EQ(0u, shade_pixels[3][3]);
 
   ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80808080);
-  ASSERT_EQ(5u, shade_pixels[0][0]);
-  ASSERT_EQ(10u, shade_pixels[0][1]);
-  ASSERT_EQ(20u, shade_pixels[0][2]);
-  ASSERT_EQ(40u, shade_pixels[0][3]);
+  EXPECT_EQ(5u, shade_pixels[0][0]);
+  EXPECT_EQ(10u, shade_pixels[0][1]);
+  EXPECT_EQ(20u, shade_pixels[0][2]);
+  EXPECT_EQ(40u, shade_pixels[0][3]);
 
   ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x10204080);
-  ASSERT_EQ(5u, shade_pixels[0][0]);
-  ASSERT_EQ(5u, shade_pixels[0][1]);
-  ASSERT_EQ(5u, shade_pixels[0][2]);
-  ASSERT_EQ(5u, shade_pixels[0][3]);
+  EXPECT_EQ(5u, shade_pixels[0][0]);
+  EXPECT_EQ(5u, shade_pixels[0][1]);
+  EXPECT_EQ(5u, shade_pixels[0][2]);
+  EXPECT_EQ(5u, shade_pixels[0][3]);
 
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
     ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 1280, 1,
@@ -1003,37 +1007,37 @@ TEST_F(LibYUVPlanarTest, TestARGBInterpolate) {
 
   ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
                   &interpolate_pixels[0][0], 0, 4, 1, 128);
-  ASSERT_EQ(8u, interpolate_pixels[0][0]);
-  ASSERT_EQ(16u, interpolate_pixels[0][1]);
-  ASSERT_EQ(32u, interpolate_pixels[0][2]);
-  ASSERT_EQ(64u, interpolate_pixels[0][3]);
-  ASSERT_EQ(0u, interpolate_pixels[1][0]);
-  ASSERT_EQ(0u, interpolate_pixels[1][1]);
-  ASSERT_EQ(0u, interpolate_pixels[1][2]);
-  ASSERT_EQ(128u, interpolate_pixels[1][3]);
-  ASSERT_EQ(0u, interpolate_pixels[2][0]);
-  ASSERT_EQ(0u, interpolate_pixels[2][1]);
-  ASSERT_EQ(0u, interpolate_pixels[2][2]);
-  ASSERT_EQ(0u, interpolate_pixels[2][3]);
-  ASSERT_EQ(128u, interpolate_pixels[3][0]);
-  ASSERT_EQ(128u, interpolate_pixels[3][1]);
-  ASSERT_EQ(128u, interpolate_pixels[3][2]);
-  ASSERT_EQ(128u, interpolate_pixels[3][3]);
+  EXPECT_EQ(8u, interpolate_pixels[0][0]);
+  EXPECT_EQ(16u, interpolate_pixels[0][1]);
+  EXPECT_EQ(32u, interpolate_pixels[0][2]);
+  EXPECT_EQ(64u, interpolate_pixels[0][3]);
+  EXPECT_EQ(0u, interpolate_pixels[1][0]);
+  EXPECT_EQ(0u, interpolate_pixels[1][1]);
+  EXPECT_EQ(0u, interpolate_pixels[1][2]);
+  EXPECT_EQ(128u, interpolate_pixels[1][3]);
+  EXPECT_EQ(0u, interpolate_pixels[2][0]);
+  EXPECT_EQ(0u, interpolate_pixels[2][1]);
+  EXPECT_EQ(0u, interpolate_pixels[2][2]);
+  EXPECT_EQ(0u, interpolate_pixels[2][3]);
+  EXPECT_EQ(128u, interpolate_pixels[3][0]);
+  EXPECT_EQ(128u, interpolate_pixels[3][1]);
+  EXPECT_EQ(128u, interpolate_pixels[3][2]);
+  EXPECT_EQ(128u, interpolate_pixels[3][3]);
 
   ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
                   &interpolate_pixels[0][0], 0, 4, 1, 0);
-  ASSERT_EQ(16u, interpolate_pixels[0][0]);
-  ASSERT_EQ(32u, interpolate_pixels[0][1]);
-  ASSERT_EQ(64u, interpolate_pixels[0][2]);
-  ASSERT_EQ(128u, interpolate_pixels[0][3]);
+  EXPECT_EQ(16u, interpolate_pixels[0][0]);
+  EXPECT_EQ(32u, interpolate_pixels[0][1]);
+  EXPECT_EQ(64u, interpolate_pixels[0][2]);
+  EXPECT_EQ(128u, interpolate_pixels[0][3]);
 
   ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
                   &interpolate_pixels[0][0], 0, 4, 1, 192);
 
-  ASSERT_EQ(4u, interpolate_pixels[0][0]);
-  ASSERT_EQ(8u, interpolate_pixels[0][1]);
-  ASSERT_EQ(16u, interpolate_pixels[0][2]);
-  ASSERT_EQ(32u, interpolate_pixels[0][3]);
+  EXPECT_EQ(4u, interpolate_pixels[0][0]);
+  EXPECT_EQ(8u, interpolate_pixels[0][1]);
+  EXPECT_EQ(16u, interpolate_pixels[0][2]);
+  EXPECT_EQ(32u, interpolate_pixels[0][3]);
 
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
     ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
@@ -1084,37 +1088,37 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
 
   InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
                    &interpolate_pixels[0], 0, 16, 1, 128);
-  ASSERT_EQ(8u, interpolate_pixels[0]);
-  ASSERT_EQ(16u, interpolate_pixels[1]);
-  ASSERT_EQ(32u, interpolate_pixels[2]);
-  ASSERT_EQ(64u, interpolate_pixels[3]);
-  ASSERT_EQ(0u, interpolate_pixels[4]);
-  ASSERT_EQ(0u, interpolate_pixels[5]);
-  ASSERT_EQ(0u, interpolate_pixels[6]);
-  ASSERT_EQ(128u, interpolate_pixels[7]);
-  ASSERT_EQ(0u, interpolate_pixels[8]);
-  ASSERT_EQ(0u, interpolate_pixels[9]);
-  ASSERT_EQ(0u, interpolate_pixels[10]);
-  ASSERT_EQ(0u, interpolate_pixels[11]);
-  ASSERT_EQ(128u, interpolate_pixels[12]);
-  ASSERT_EQ(128u, interpolate_pixels[13]);
-  ASSERT_EQ(128u, interpolate_pixels[14]);
-  ASSERT_EQ(128u, interpolate_pixels[15]);
+  EXPECT_EQ(8u, interpolate_pixels[0]);
+  EXPECT_EQ(16u, interpolate_pixels[1]);
+  EXPECT_EQ(32u, interpolate_pixels[2]);
+  EXPECT_EQ(64u, interpolate_pixels[3]);
+  EXPECT_EQ(0u, interpolate_pixels[4]);
+  EXPECT_EQ(0u, interpolate_pixels[5]);
+  EXPECT_EQ(0u, interpolate_pixels[6]);
+  EXPECT_EQ(128u, interpolate_pixels[7]);
+  EXPECT_EQ(0u, interpolate_pixels[8]);
+  EXPECT_EQ(0u, interpolate_pixels[9]);
+  EXPECT_EQ(0u, interpolate_pixels[10]);
+  EXPECT_EQ(0u, interpolate_pixels[11]);
+  EXPECT_EQ(128u, interpolate_pixels[12]);
+  EXPECT_EQ(128u, interpolate_pixels[13]);
+  EXPECT_EQ(128u, interpolate_pixels[14]);
+  EXPECT_EQ(128u, interpolate_pixels[15]);
 
   InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
                    &interpolate_pixels[0], 0, 16, 1, 0);
-  ASSERT_EQ(16u, interpolate_pixels[0]);
-  ASSERT_EQ(32u, interpolate_pixels[1]);
-  ASSERT_EQ(64u, interpolate_pixels[2]);
-  ASSERT_EQ(128u, interpolate_pixels[3]);
+  EXPECT_EQ(16u, interpolate_pixels[0]);
+  EXPECT_EQ(32u, interpolate_pixels[1]);
+  EXPECT_EQ(64u, interpolate_pixels[2]);
+  EXPECT_EQ(128u, interpolate_pixels[3]);
 
   InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
                    &interpolate_pixels[0], 0, 16, 1, 192);
 
-  ASSERT_EQ(4u, interpolate_pixels[0]);
-  ASSERT_EQ(8u, interpolate_pixels[1]);
-  ASSERT_EQ(16u, interpolate_pixels[2]);
-  ASSERT_EQ(32u, interpolate_pixels[3]);
+  EXPECT_EQ(4u, interpolate_pixels[0]);
+  EXPECT_EQ(8u, interpolate_pixels[1]);
+  EXPECT_EQ(16u, interpolate_pixels[2]);
+  EXPECT_EQ(32u, interpolate_pixels[3]);
 
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
     InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
@@ -1165,37 +1169,37 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) {
 
   InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
                       &interpolate_pixels[0], 0, 16, 1, 128);
-  ASSERT_EQ(8u, interpolate_pixels[0]);
-  ASSERT_EQ(16u, interpolate_pixels[1]);
-  ASSERT_EQ(32u, interpolate_pixels[2]);
-  ASSERT_EQ(64u, interpolate_pixels[3]);
-  ASSERT_EQ(0u, interpolate_pixels[4]);
-  ASSERT_EQ(0u, interpolate_pixels[5]);
-  ASSERT_EQ(0u, interpolate_pixels[6]);
-  ASSERT_EQ(128u, interpolate_pixels[7]);
-  ASSERT_EQ(0u, interpolate_pixels[8]);
-  ASSERT_EQ(0u, interpolate_pixels[9]);
-  ASSERT_EQ(0u, interpolate_pixels[10]);
-  ASSERT_EQ(0u, interpolate_pixels[11]);
-  ASSERT_EQ(128u, interpolate_pixels[12]);
-  ASSERT_EQ(128u, interpolate_pixels[13]);
-  ASSERT_EQ(128u, interpolate_pixels[14]);
-  ASSERT_EQ(128u, interpolate_pixels[15]);
+  EXPECT_EQ(8u, interpolate_pixels[0]);
+  EXPECT_EQ(16u, interpolate_pixels[1]);
+  EXPECT_EQ(32u, interpolate_pixels[2]);
+  EXPECT_EQ(64u, interpolate_pixels[3]);
+  EXPECT_EQ(0u, interpolate_pixels[4]);
+  EXPECT_EQ(0u, interpolate_pixels[5]);
+  EXPECT_EQ(0u, interpolate_pixels[6]);
+  EXPECT_EQ(128u, interpolate_pixels[7]);
+  EXPECT_EQ(0u, interpolate_pixels[8]);
+  EXPECT_EQ(0u, interpolate_pixels[9]);
+  EXPECT_EQ(0u, interpolate_pixels[10]);
+  EXPECT_EQ(0u, interpolate_pixels[11]);
+  EXPECT_EQ(128u, interpolate_pixels[12]);
+  EXPECT_EQ(128u, interpolate_pixels[13]);
+  EXPECT_EQ(128u, interpolate_pixels[14]);
+  EXPECT_EQ(128u, interpolate_pixels[15]);
 
   InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
                       &interpolate_pixels[0], 0, 16, 1, 0);
-  ASSERT_EQ(16u, interpolate_pixels[0]);
-  ASSERT_EQ(32u, interpolate_pixels[1]);
-  ASSERT_EQ(64u, interpolate_pixels[2]);
-  ASSERT_EQ(128u, interpolate_pixels[3]);
+  EXPECT_EQ(16u, interpolate_pixels[0]);
+  EXPECT_EQ(32u, interpolate_pixels[1]);
+  EXPECT_EQ(64u, interpolate_pixels[2]);
+  EXPECT_EQ(128u, interpolate_pixels[3]);
 
   InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
                       &interpolate_pixels[0], 0, 16, 1, 192);
 
-  ASSERT_EQ(4u, interpolate_pixels[0]);
-  ASSERT_EQ(8u, interpolate_pixels[1]);
-  ASSERT_EQ(16u, interpolate_pixels[2]);
-  ASSERT_EQ(32u, interpolate_pixels[3]);
+  EXPECT_EQ(4u, interpolate_pixels[0]);
+  EXPECT_EQ(8u, interpolate_pixels[1]);
+  EXPECT_EQ(16u, interpolate_pixels[2]);
+  EXPECT_EQ(32u, interpolate_pixels[3]);
 
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
     InterpolatePlane_16(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
@@ -1212,10 +1216,10 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) {
         (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;                \
     const int kStrideB =                                                      \
         (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                \
-    align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);              \
-    align_buffer_page_end(src_argb_b, kStrideA * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                    \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                  \
+    align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF);               \
+    align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight);                     \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight);                   \
     for (int i = 0; i < kStrideA * kHeight; ++i) {                            \
       src_argb_a[i + OFF] = (fastrand() & 0xff);                              \
       src_argb_b[i + OFF] = (fastrand() & 0xff);                              \
@@ -1229,7 +1233,7 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) {
                       dst_argb_opt, kStrideB, kWidth, NEG kHeight, TERP);     \
     }                                                                         \
     for (int i = 0; i < kStrideB * kHeight; ++i) {                            \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                              \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                              \
     }                                                                         \
     free_aligned_buffer_page_end(src_argb_a);                                 \
     free_aligned_buffer_page_end(src_argb_b);                                 \
@@ -1306,35 +1310,35 @@ TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
   int max_diff =
       TestBlend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
   int max_diff =
       TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
   int max_diff =
       TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) {
   int max_diff =
       TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
   int max_diff =
       TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 static void TestBlendPlane(int width,
@@ -1366,14 +1370,14 @@ static void TestBlendPlane(int width,
   BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
              src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1);
   for (int i = 0; i < width; ++i) {
-    ASSERT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]);
+    EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]);
   }
   // Test destination is maintained exactly if alpha is 0.
   memset(src_argb_alpha + off, 0, width);
   BlendPlane(src_argb_a + off, width, src_argb_b + off, width,
              src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1);
   for (int i = 0; i < width; ++i) {
-    ASSERT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]);
+    EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]);
   }
   for (int i = 0; i < kStride * height; ++i) {
     src_argb_a[i + off] = (fastrand() & 0xff);
@@ -1392,7 +1396,7 @@ static void TestBlendPlane(int width,
                invert * height);
   }
   for (int i = 0; i < kStride * height; ++i) {
-    ASSERT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]);
+    EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]);
   }
   free_aligned_buffer_page_end(src_argb_a);
   free_aligned_buffer_page_end(src_argb_b);
@@ -1418,7 +1422,7 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {
                  disable_cpu_flags_, benchmark_cpu_info_, -1, 1);
 }
 
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
 
 static void TestI420Blend(int width,
                           int height,
@@ -1473,11 +1477,11 @@ static void TestI420Blend(int width,
               width, invert * height);
   }
   for (int i = 0; i < width * height; ++i) {
-    ASSERT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
+    EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
   }
   for (int i = 0; i < kSizeUV; ++i) {
-    ASSERT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
-    ASSERT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
+    EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
+    EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
   }
   free_aligned_buffer_page_end(src_y0);
   free_aligned_buffer_page_end(src_u0);
@@ -1528,15 +1532,15 @@ TEST_F(LibYUVPlanarTest, TestAffine) {
 
   ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], uv_step,
                   1280);
-  ASSERT_EQ(0u, interpolate_pixels_C[0][0]);
-  ASSERT_EQ(96u, interpolate_pixels_C[128][0]);
-  ASSERT_EQ(191u, interpolate_pixels_C[255][3]);
+  EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
+  EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
+  EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
 
 #if defined(HAS_ARGBAFFINEROW_SSE2)
   SIMD_ALIGNED(uint8_t interpolate_pixels_Opt[1280][4]);
   ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
                      uv_step, 1280);
-  ASSERT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4));
+  EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4));
 
   int has_sse2 = TestCpuFlag(kCpuHasSSE2);
   if (has_sse2) {
@@ -1599,28 +1603,28 @@ TEST_F(LibYUVPlanarTest, CopyPlane_Any) {
   int max_diff = TestCopyPlane(benchmark_width_ + 1, benchmark_height_,
                                benchmark_iterations_, disable_cpu_flags_,
                                benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, CopyPlane_Unaligned) {
   int max_diff =
       TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
                     disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, CopyPlane_Invert) {
   int max_diff =
       TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
                     disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, CopyPlane_Opt) {
   int max_diff =
       TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
                     disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, TestCopyPlaneZero) {
@@ -1632,30 +1636,30 @@ TEST_F(LibYUVPlanarTest, TestCopyPlaneZero) {
   // Disable all optimizations.
   MaskCpuFlags(disable_cpu_flags_);
   CopyPlane(&src, 0, &dst, 0, 0, 0);
-  ASSERT_EQ(src, 42);
-  ASSERT_EQ(dst, 0);
+  EXPECT_EQ(src, 42);
+  EXPECT_EQ(dst, 0);
 
   CopyPlane(&src, 1, &dst, 1, 1, 0);
-  ASSERT_EQ(src, 42);
-  ASSERT_EQ(dst, 0);
+  EXPECT_EQ(src, 42);
+  EXPECT_EQ(dst, 0);
 
   CopyPlane(&src, 1, &dst, 1, 0, 1);
-  ASSERT_EQ(src, 42);
-  ASSERT_EQ(dst, 0);
+  EXPECT_EQ(src, 42);
+  EXPECT_EQ(dst, 0);
 
   // Enable optimizations.
   MaskCpuFlags(benchmark_cpu_info_);
   CopyPlane(&src, 0, &dst, 0, 0, 0);
-  ASSERT_EQ(src, 42);
-  ASSERT_EQ(dst, 0);
+  EXPECT_EQ(src, 42);
+  EXPECT_EQ(dst, 0);
 
   CopyPlane(&src, 1, &dst, 1, 1, 0);
-  ASSERT_EQ(src, 42);
-  ASSERT_EQ(dst, 0);
+  EXPECT_EQ(src, 42);
+  EXPECT_EQ(dst, 0);
 
   CopyPlane(&src, 1, &dst, 1, 0, 1);
-  ASSERT_EQ(src, 42);
-  ASSERT_EQ(dst, 0);
+  EXPECT_EQ(src, 42);
+  EXPECT_EQ(dst, 0);
 }
 
 TEST_F(LibYUVPlanarTest, TestDetilePlane) {
@@ -1689,7 +1693,7 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
   }
 
   for (i = 0; i < y_plane_size; ++i) {
-    ASSERT_EQ(dst_c[i], dst_opt[i]);
+    EXPECT_EQ(dst_c[i], dst_opt[i]);
   }
 
   free_aligned_buffer_page_end(tile_y);
@@ -1728,7 +1732,7 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane_16) {
   }
 
   for (i = 0; i < y_plane_size; ++i) {
-    ASSERT_EQ(dst_c[i], dst_opt[i]);
+    EXPECT_EQ(dst_c[i], dst_opt[i]);
   }
 
   free_aligned_buffer_page_end(tile_y);
@@ -1774,8 +1778,8 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
   }
 
   for (i = 0; i < uv_plane_size; ++i) {
-    ASSERT_EQ(dst_u_two_stage[i], dst_u_opt[i]);
-    ASSERT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
+    EXPECT_EQ(dst_u_two_stage[i], dst_u_opt[i]);
+    EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
   }
 
   free_aligned_buffer_page_end(tile_uv);
@@ -1823,8 +1827,8 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
   }
 
   for (i = 0; i < uv_plane_size; ++i) {
-    ASSERT_EQ(dst_u_c[i], dst_u_opt[i]);
-    ASSERT_EQ(dst_v_c[i], dst_v_opt[i]);
+    EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);
+    EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
   }
 
   free_aligned_buffer_page_end(tile_uv);
@@ -1884,28 +1888,28 @@ TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) {
   int max_diff = TestMultiply(benchmark_width_ + 1, benchmark_height_,
                               benchmark_iterations_, disable_cpu_flags_,
                               benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBMultiply_Unaligned) {
   int max_diff =
       TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBMultiply_Invert) {
   int max_diff =
       TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBMultiply_Opt) {
   int max_diff =
       TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 0);
+  EXPECT_LE(max_diff, 0);
 }
 
 static int TestAdd(int width,
@@ -1958,28 +1962,28 @@ TEST_F(LibYUVPlanarTest, ARGBAdd_Any) {
   int max_diff =
       TestAdd(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
               disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAdd_Unaligned) {
   int max_diff =
       TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
               disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAdd_Invert) {
   int max_diff =
       TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
               disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBAdd_Opt) {
   int max_diff =
       TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_,
               disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 static int TestSubtract(int width,
@@ -2032,28 +2036,28 @@ TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) {
   int max_diff = TestSubtract(benchmark_width_ + 1, benchmark_height_,
                               benchmark_iterations_, disable_cpu_flags_,
                               benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSubtract_Unaligned) {
   int max_diff =
       TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSubtract_Invert) {
   int max_diff =
       TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSubtract_Opt) {
   int max_diff =
       TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 static int TestSobel(int width,
@@ -2104,28 +2108,28 @@ TEST_F(LibYUVPlanarTest, ARGBSobel_Any) {
   int max_diff =
       TestSobel(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobel_Unaligned) {
   int max_diff =
       TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobel_Invert) {
   int max_diff =
       TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobel_Opt) {
   int max_diff =
       TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_,
                 disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 static int TestSobelToPlane(int width,
@@ -2178,28 +2182,28 @@ TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) {
   int max_diff = TestSobelToPlane(benchmark_width_ + 1, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Unaligned) {
   int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 1);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Invert) {
   int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, -1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Opt) {
   int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
                                   benchmark_iterations_, disable_cpu_flags_,
                                   benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 static int TestSobelXY(int width,
@@ -2250,28 +2254,28 @@ TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) {
   int max_diff = TestSobelXY(benchmark_width_ + 1, benchmark_height_,
                              benchmark_iterations_, disable_cpu_flags_,
                              benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobelXY_Unaligned) {
   int max_diff =
       TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
                   disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobelXY_Invert) {
   int max_diff =
       TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
                   disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBSobelXY_Opt) {
   int max_diff =
       TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_,
                   disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 static int TestBlur(int width,
@@ -2334,28 +2338,28 @@ TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Any)) {
   int max_diff =
       TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Unaligned)) {
   int max_diff =
       TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Invert)) {
   int max_diff =
       TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Opt)) {
   int max_diff =
       TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 static const int kBlurSmallSize = 5;
@@ -2363,28 +2367,28 @@ TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Any)) {
   int max_diff =
       TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Unaligned)) {
   int max_diff =
       TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Invert)) {
   int max_diff =
       TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Opt)) {
   int max_diff =
       TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_,
                disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize);
-  ASSERT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) {
@@ -2428,26 +2432,26 @@ TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) {
   // Do 16 to test asm version.
   ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
                  &kWarmifyPolynomial[0], 16, 1);
-  ASSERT_EQ(235u, dst_pixels_opt[0][0]);
-  ASSERT_EQ(0u, dst_pixels_opt[0][1]);
-  ASSERT_EQ(0u, dst_pixels_opt[0][2]);
-  ASSERT_EQ(128u, dst_pixels_opt[0][3]);
-  ASSERT_EQ(0u, dst_pixels_opt[1][0]);
-  ASSERT_EQ(233u, dst_pixels_opt[1][1]);
-  ASSERT_EQ(0u, dst_pixels_opt[1][2]);
-  ASSERT_EQ(0u, dst_pixels_opt[1][3]);
-  ASSERT_EQ(0u, dst_pixels_opt[2][0]);
-  ASSERT_EQ(0u, dst_pixels_opt[2][1]);
-  ASSERT_EQ(241u, dst_pixels_opt[2][2]);
-  ASSERT_EQ(255u, dst_pixels_opt[2][3]);
-  ASSERT_EQ(235u, dst_pixels_opt[3][0]);
-  ASSERT_EQ(233u, dst_pixels_opt[3][1]);
-  ASSERT_EQ(241u, dst_pixels_opt[3][2]);
-  ASSERT_EQ(255u, dst_pixels_opt[3][3]);
-  ASSERT_EQ(10u, dst_pixels_opt[4][0]);
-  ASSERT_EQ(59u, dst_pixels_opt[4][1]);
-  ASSERT_EQ(188u, dst_pixels_opt[4][2]);
-  ASSERT_EQ(224u, dst_pixels_opt[4][3]);
+  EXPECT_EQ(235u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(233u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(241u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(235u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(233u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(241u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[3][3]);
+  EXPECT_EQ(10u, dst_pixels_opt[4][0]);
+  EXPECT_EQ(59u, dst_pixels_opt[4][1]);
+  EXPECT_EQ(188u, dst_pixels_opt[4][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[4][3]);
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
@@ -2467,10 +2471,10 @@ TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) {
   }
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
-    ASSERT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
-    ASSERT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
-    ASSERT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
   }
 }
 
@@ -2535,70 +2539,70 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
   int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_, disable_cpu_flags_,
                                 benchmark_cpu_info_, 1.0f, 65535, +1, 0);
-  ASSERT_LE(diff, 1);
+  EXPECT_LE(diff, 1);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
   int diff = TestHalfFloatPlane(
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) {
   int diff = TestHalfFloatPlane(
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
   int diff = TestHalfFloatPlane(
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
   int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_, disable_cpu_flags_,
                                 benchmark_cpu_info_, 1.0f / 511.0f, 511, +1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) {
   int diff = TestHalfFloatPlane(
       benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) {
   int diff = TestHalfFloatPlane(
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) {
   int diff = TestHalfFloatPlane(
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
   int diff = TestHalfFloatPlane(
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
   int diff = TestHalfFloatPlane(
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 #if defined(__arm__) && !defined(__SOFTFP__)
@@ -2631,7 +2635,7 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) {
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
   DisableFlushDenormalToZero();
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
@@ -2640,7 +2644,7 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
       benchmark_width_, benchmark_height_, benchmark_iterations_,
       disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
   DisableFlushDenormalToZero();
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 #endif  // defined(__arm__) && !defined(__SOFTFP__)
 
@@ -2687,7 +2691,7 @@ TEST_F(LibYUVPlanarTest, TestByteToFloat) {
   float diff = TestByteToFloat(benchmark_width_, benchmark_height_,
                                benchmark_iterations_, disable_cpu_flags_,
                                benchmark_cpu_info_, 1.0f);
-  ASSERT_EQ(0.f, diff);
+  EXPECT_EQ(0.f, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
@@ -2725,22 +2729,22 @@ TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
   // Do 16 to test asm version.
   ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
                      &lumacolortable[0], 16, 1);
-  ASSERT_EQ(253u, dst_pixels_opt[0][0]);
-  ASSERT_EQ(0u, dst_pixels_opt[0][1]);
-  ASSERT_EQ(0u, dst_pixels_opt[0][2]);
-  ASSERT_EQ(128u, dst_pixels_opt[0][3]);
-  ASSERT_EQ(0u, dst_pixels_opt[1][0]);
-  ASSERT_EQ(253u, dst_pixels_opt[1][1]);
-  ASSERT_EQ(0u, dst_pixels_opt[1][2]);
-  ASSERT_EQ(0u, dst_pixels_opt[1][3]);
-  ASSERT_EQ(0u, dst_pixels_opt[2][0]);
-  ASSERT_EQ(0u, dst_pixels_opt[2][1]);
-  ASSERT_EQ(253u, dst_pixels_opt[2][2]);
-  ASSERT_EQ(255u, dst_pixels_opt[2][3]);
-  ASSERT_EQ(48u, dst_pixels_opt[3][0]);
-  ASSERT_EQ(192u, dst_pixels_opt[3][1]);
-  ASSERT_EQ(64u, dst_pixels_opt[3][2]);
-  ASSERT_EQ(224u, dst_pixels_opt[3][3]);
+  EXPECT_EQ(253u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(253u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(253u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(48u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(192u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(64u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[3][3]);
 
   for (int i = 0; i < 1280; ++i) {
     orig_pixels[i][0] = i;
@@ -2759,10 +2763,10 @@ TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
                        lumacolortable, 1280, 1);
   }
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
-    ASSERT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
-    ASSERT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
-    ASSERT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
   }
 
   free_aligned_buffer_page_end(lumacolortable);
@@ -2788,7 +2792,7 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) {
                   benchmark_width_ * 4, benchmark_width_, benchmark_height_);
   }
   for (int i = 0; i < kSize; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_pixels_c);
@@ -2827,7 +2831,7 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
   printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
          static_cast<int>(opt_time * 1e6));
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_pixels_c);
@@ -2867,7 +2871,7 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
   printf("%8d us C - %8d us OPT\n", static_cast<int>(c_time * 1e6),
          static_cast<int>(opt_time * 1e6));
   for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_pixels_c);
@@ -2928,56 +2932,56 @@ TEST_F(LibYUVPlanarTest, ARGBRect_Any) {
   int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_,
                               benchmark_iterations_, disable_cpu_flags_,
                               benchmark_cpu_info_, +1, 0, 4);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBRect_Unaligned) {
   int max_diff =
       TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 4);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBRect_Invert) {
   int max_diff =
       TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 4);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, ARGBRect_Opt) {
   int max_diff =
       TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, SetPlane_Any) {
   int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_,
                               benchmark_iterations_, disable_cpu_flags_,
                               benchmark_cpu_info_, +1, 0, 1);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, SetPlane_Unaligned) {
   int max_diff =
       TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, SetPlane_Invert) {
   int max_diff =
       TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
   int max_diff =
       TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_,
                    disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
-  ASSERT_EQ(0, max_diff);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
@@ -3005,7 +3009,7 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
   }
 
   for (int i = 0; i < kPixels * 2; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_u);
@@ -3041,7 +3045,7 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
   }
 
   for (int i = 0; i < kPixels * 2 * 2; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
   free_aligned_buffer_page_end(src_pixels_u);
   free_aligned_buffer_page_end(src_pixels_v);
@@ -3076,8 +3080,8 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
   }
 
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
-    ASSERT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+    EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+    EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3116,8 +3120,8 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
   }
 
   for (int i = 0; i < kPixels * 2; ++i) {
-    ASSERT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
-    ASSERT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+    EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+    EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
   }
   free_aligned_buffer_page_end(src_pixels);
   free_aligned_buffer_page_end(dst_pixels_u_c);
@@ -3148,7 +3152,7 @@ TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
   }
 
   for (int i = 0; i < kPixels * 2; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3203,7 +3207,7 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
   }
 
   for (int i = 0; i < kPixels * 3; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3263,7 +3267,7 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
                 benchmark_height_);
 
   for (int i = 0; i < kPixels * 3; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3327,7 +3331,7 @@ TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) {
   }
 
   for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3393,7 +3397,7 @@ TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) {
                  benchmark_width_ * 4, benchmark_width_, benchmark_height_);
 
   for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3455,7 +3459,7 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
   }
 
   for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3515,7 +3519,7 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
                  benchmark_height_);
 
   for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
@@ -3563,7 +3567,7 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
                   kWidth, NEG benchmark_height_, DEPTH);                    \
     }                                                                       \
     for (int i = 0; i < kPixels * 4; ++i) {                                 \
-      ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
+      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
     }                                                                       \
     free_aligned_buffer_page_end(src_memory_r);                             \
     free_aligned_buffer_page_end(src_memory_g);                             \
@@ -3604,7 +3608,7 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
                   NEG benchmark_height_, DEPTH);                            \
     }                                                                       \
     for (int i = 0; i < kPixels * 4; ++i) {                                 \
-      ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
+      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
     }                                                                       \
     free_aligned_buffer_page_end(src_memory_r);                             \
     free_aligned_buffer_page_end(src_memory_g);                             \
@@ -3663,7 +3667,7 @@ TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
                   NEG benchmark_height_, DEPTH);                            \
     }                                                                       \
     for (int i = 0; i < kPixels * 4; ++i) {                                 \
-      ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
+      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
     }                                                                       \
     free_aligned_buffer_page_end(src_memory_r);                             \
     free_aligned_buffer_page_end(src_memory_g);                             \
@@ -3719,7 +3723,7 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
   }
 
   for (int i = 0; i < kPixels * 2 * 2; ++i) {
-    ASSERT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
+    EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_u);
@@ -3760,7 +3764,7 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
   }
 
   for (int i = 0; i < kPixels * 2; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -3792,7 +3796,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
   }
 
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -3823,7 +3827,7 @@ TEST_F(LibYUVPlanarTest, Convert8To8Plane) {
   }
 
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -3852,7 +3856,7 @@ TEST_F(LibYUVPlanarTest, YUY2ToY) {
   }
 
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -3881,7 +3885,7 @@ TEST_F(LibYUVPlanarTest, UYVYToY) {
   }
 
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -3927,7 +3931,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
   }
 
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -3955,7 +3959,7 @@ TEST_F(LibYUVPlanarTest, UYVYToYRow_Opt) {
   }
 
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -3991,7 +3995,7 @@ TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
   }
 
   for (int i = 0; i < kPixels * 2; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -4034,7 +4038,7 @@ TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
   }
 
   for (int i = 0; i < kPixels * 2; ++i) {
-    ASSERT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_y);
@@ -4102,13 +4106,13 @@ float TestScaleMaxSamples(int benchmark_width,
 TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_C) {
   float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_,
                                    benchmark_iterations_, 1.2f, false);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_Opt) {
   float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_,
                                    benchmark_iterations_, 1.2f, true);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 float TestScaleSumSamples(int benchmark_width,
@@ -4181,13 +4185,13 @@ float TestScaleSumSamples(int benchmark_width,
 TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) {
   float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
                                    benchmark_iterations_, 1.2f, false);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) {
   float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
                                    benchmark_iterations_, 1.2f, true);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 float TestScaleSamples(int benchmark_width,
@@ -4245,13 +4249,13 @@ float TestScaleSamples(int benchmark_width,
 TEST_F(LibYUVPlanarTest, TestScaleSamples_C) {
   float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_, 1.2f, false);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
   float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_, 1.2f, true);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 float TestCopySamples(int benchmark_width,
@@ -4305,13 +4309,13 @@ float TestCopySamples(int benchmark_width,
 TEST_F(LibYUVPlanarTest, TestCopySamples_C) {
   float diff = TestCopySamples(benchmark_width_, benchmark_height_,
                                benchmark_iterations_, false);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) {
   float diff = TestCopySamples(benchmark_width_, benchmark_height_,
                                benchmark_iterations_, true);
-  ASSERT_EQ(0, diff);
+  EXPECT_EQ(0, diff);
 }
 
 extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
@@ -4345,12 +4349,12 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
   }
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
-  ASSERT_EQ(dst_pixels_c[0],
+  EXPECT_EQ(dst_pixels_c[0],
             static_cast<uint16_t>(0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1));
-  ASSERT_EQ(dst_pixels_c[639], static_cast<uint16_t>(10256));
+  EXPECT_EQ(dst_pixels_c[639], static_cast<uint16_t>(10256));
 }
 
 extern "C" void GaussCol_NEON(const uint16_t* src0,
@@ -4405,7 +4409,7 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
   }
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 }
 
@@ -4436,7 +4440,7 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
   }
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 }
 
@@ -4476,7 +4480,7 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
   }
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
   free_aligned_buffer_page_end(orig_pixels_buf);
 }
@@ -4504,8 +4508,8 @@ TEST_F(LibYUVPlanarTest, SwapUVRow) {
     SwapUVRow(src_pixels_vu, dst_pixels_uv, kPixels);
   }
   for (int i = 0; i < kPixels; ++i) {
-    ASSERT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
-    ASSERT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
+    EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
+    EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
   }
 
   free_aligned_buffer_page_end(src_pixels_vu);
@@ -4537,7 +4541,7 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
                    benchmark_height_);
   }
   for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
-    ASSERT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
+    EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
         << i;
   }
 
@@ -4572,7 +4576,7 @@ TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
   }
 
   for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
-    ASSERT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+    EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels_u);
@@ -4601,10 +4605,10 @@ TEST_F(LibYUVPlanarTest, NV12Copy) {
   }
 
   for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
-    ASSERT_EQ(src_y[i], dst_y[i]);
+    EXPECT_EQ(src_y[i], dst_y[i]);
   }
   for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
-    ASSERT_EQ(src_uv[i], dst_uv[i]);
+    EXPECT_EQ(src_uv[i], dst_uv[i]);
   }
 
   free_aligned_buffer_page_end(src_y);
@@ -4633,10 +4637,10 @@ TEST_F(LibYUVPlanarTest, NV21Copy) {
   }
 
   for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
-    ASSERT_EQ(src_y[i], dst_y[i]);
+    EXPECT_EQ(src_y[i], dst_y[i]);
   }
   for (int i = 0; i < halfwidth * 2 * halfheight; ++i) {
-    ASSERT_EQ(src_vu[i], dst_vu[i]);
+    EXPECT_EQ(src_vu[i], dst_vu[i]);
   }
 
   free_aligned_buffer_page_end(src_y);
@@ -4676,7 +4680,7 @@ TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
                             y_plane_size);
 
   for (i = 0; i < y_plane_size; ++i) {
-    ASSERT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+    EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
   }
 
   free_aligned_buffer_page_end(orig_f);
@@ -4713,7 +4717,7 @@ TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32Column) {
                             y_plane_size);
 
   for (i = 0; i < y_plane_size; ++i) {
-    ASSERT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
+    EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
   }
 
   free_aligned_buffer_page_end(orig_f);
diff --git a/unit_test/rotate_argb_test.cc b/unit_test/rotate_argb_test.cc
index 701e57a01..4c7b0b250 100644
--- a/unit_test/rotate_argb_test.cc
+++ b/unit_test/rotate_argb_test.cc
@@ -75,7 +75,7 @@ static void TestRotateBpp(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_argb_plane_size; ++i) {
-    ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_argb_c);
@@ -189,35 +189,35 @@ TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
   align_buffer_page_end(src_argb, argb_plane_size);
   align_buffer_page_end(dst_argb, argb_plane_size);
 
-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                           benchmark_width_ * 4, benchmark_width_,
                           benchmark_height_, kRotate0));
 
-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                           benchmark_width_ * 4 - 1, benchmark_width_ - 1,
                           benchmark_height_, kRotate0));
 
-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                           benchmark_width_ * 4, benchmark_width_,
                           benchmark_height_, kRotate180));
 
-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                           benchmark_width_ * 4 - 1, benchmark_width_ - 1,
                           benchmark_height_, kRotate180));
 
-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                           abs(benchmark_height_) * 4, benchmark_width_,
                           benchmark_height_, kRotate90));
 
-  ASSERT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                            abs(benchmark_height_) * 4, benchmark_width_ - 1,
                            benchmark_height_, kRotate90));
 
-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                           abs(benchmark_height_) * 4, benchmark_width_,
                           benchmark_height_, kRotate270));
 
-  ASSERT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                            abs(benchmark_height_) * 4, benchmark_width_ - 1,
                            benchmark_height_, kRotate270));
 
@@ -271,7 +271,7 @@ static void TestRotatePlane_16(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_plane_size; ++i) {
-    ASSERT_EQ(dst_c[i], dst_opt[i]);
+    EXPECT_EQ(dst_c[i], dst_opt[i]);
   }
 
   free_aligned_buffer_page_end_16(dst_c);
diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc
index 10ee64cbc..abc08efa8 100644
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
@@ -20,7 +20,7 @@
 
 namespace libyuv {
 
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
 
 static void I420TestRotate(int src_width,
                            int src_height,
@@ -78,7 +78,7 @@ static void I420TestRotate(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_i420_size; ++i) {
-    ASSERT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_i420_c);
@@ -197,7 +197,7 @@ static void I422TestRotate(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_i422_size; ++i) {
-    ASSERT_EQ(dst_i422_c[i], dst_i422_opt[i]);
+    EXPECT_EQ(dst_i422_c[i], dst_i422_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_i422_c);
@@ -283,7 +283,7 @@ static void I444TestRotate(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_i444_size; ++i) {
-    ASSERT_EQ(dst_i444_c[i], dst_i444_opt[i]);
+    EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_i444_c);
@@ -401,7 +401,7 @@ static void NV12TestRotate(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_i420_size; ++i) {
-    ASSERT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
   }
 
   free_aligned_buffer_page_end(dst_i420_c);
@@ -495,15 +495,15 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
     const int kHeight = benchmark_height_;                                    \
     const int kSizeUV =                                                       \
         SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
     align_buffer_page_end(src_uv,                                             \
-                          kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);     \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
+                          kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);       \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
     align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
     align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
     align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
                                          SUBSAMPLE(kHeight, SUBSAMP_Y));      \
     align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
@@ -522,12 +522,12 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
             (fastrand() & 0xff);                                              \
       }                                                                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth * kHeight);                                     \
+    memset(dst_y_c, 1, kWidth* kHeight);                                      \
     memset(dst_u_c, 2,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_c, 3,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
+    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
     memset(dst_u_opt, 102,                                                    \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_opt, 103,                                                    \
@@ -550,18 +550,18 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
     }                                                                         \
     for (int i = 0; i < kHeight; ++i) {                                       \
       for (int j = 0; j < kWidth; ++j) {                                      \
-        ASSERT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
+        EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
       }                                                                       \
     }                                                                         \
     for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
       for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                \
-        ASSERT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
+        EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
                   dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]);           \
       }                                                                       \
     }                                                                         \
     for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
       for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                \
-        ASSERT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
+        EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
                   dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]);           \
       }                                                                       \
     }                                                                         \
@@ -656,7 +656,7 @@ static void I010TestRotate(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_i010_size; ++i) {
-    ASSERT_EQ(dst_i010_c[i], dst_i010_opt[i]);
+    EXPECT_EQ(dst_i010_c[i], dst_i010_opt[i]);
   }
 
   free_aligned_buffer_page_end_16(dst_i010_c);
@@ -744,7 +744,7 @@ static void I210TestRotate(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_i210_size; ++i) {
-    ASSERT_EQ(dst_i210_c[i], dst_i210_opt[i]);
+    EXPECT_EQ(dst_i210_c[i], dst_i210_opt[i]);
   }
 
   free_aligned_buffer_page_end_16(dst_i210_c);
@@ -830,7 +830,7 @@ static void I410TestRotate(int src_width,
 
   // Rotation should be exact.
   for (int i = 0; i < dst_i410_size; ++i) {
-    ASSERT_EQ(dst_i410_c[i], dst_i410_opt[i]);
+    EXPECT_EQ(dst_i410_c[i], dst_i410_opt[i]);
   }
 
   free_aligned_buffer_page_end_16(dst_i410_c);
@@ -906,8 +906,8 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
 
   for (int i = 0; i < 4; ++i) {
     for (int j = 0; j < 4; ++j) {
-      ASSERT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
-      ASSERT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
+      EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
+      EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
     }
   }
 }
@@ -949,7 +949,7 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
   }
 
   for (int i = 0; i < width * height; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
   free_aligned_buffer_page_end(src_pixels);
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 3d3e36fc5..66fd4cf31 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -245,14 +245,14 @@ static int ARGBClipTestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,          \
         benchmark_cpu_info_);                                                \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
   }                                                                          \
   TEST_F(LibYUVScaleTest, DISABLED_##ARGBScaleDownClipBy##name##_##filter) { \
     int diff = ARGBClipTestFilter(                                           \
         SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
         kFilter##filter, benchmark_iterations_);                             \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
   }
 
 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
@@ -294,28 +294,28 @@ TEST_FACTOR(3, 1, 3)
     int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width,      \
                               height, kFilter##filter, benchmark_iterations_,  \
                               disable_cpu_flags_, benchmark_cpu_info_);        \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
   }                                                                            \
   TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {           \
     int diff = ARGBTestFilter(width, height, Abs(benchmark_width_),            \
                               Abs(benchmark_height_), kFilter##filter,         \
                               benchmark_iterations_, disable_cpu_flags_,       \
                               benchmark_cpu_info_);                            \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
   }                                                                            \
   TEST_F(LibYUVScaleTest,                                                      \
          DISABLED_##name##ClipTo##width##x##height##_##filter) {               \
     int diff =                                                                 \
         ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
                            kFilter##filter, benchmark_iterations_);            \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
   }                                                                            \
   TEST_F(LibYUVScaleTest,                                                      \
          DISABLED_##name##ClipFrom##width##x##height##_##filter) {             \
     int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_),        \
                                   Abs(benchmark_height_), kFilter##filter,     \
                                   benchmark_iterations_);                      \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
   }
 
 #ifndef DISABLE_SLOW_TESTS
@@ -357,7 +357,7 @@ TEST_SCALETO(ARGBScale, 1920, 1080)
                               benchmark_height_, benchmark_width_,      \
                               kFilter##filter, benchmark_iterations_,   \
                               disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                          \
+    EXPECT_LE(diff, max_diff);                                          \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -430,14 +430,12 @@ static void FillRamp(uint8_t* buf,
 }
 
 // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static void YUVToARGBTestFilter(int src_width,
-                                int src_height,
-                                int dst_width,
-                                int dst_height,
-                                FilterMode f,
-                                int benchmark_iterations,
-                                int error_threshold,
-                                int* max_diff_out) {
+static int YUVToARGBTestFilter(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               FilterMode f,
+                               int benchmark_iterations) {
   int64_t src_y_plane_size = Abs(src_width) * Abs(src_height);
   int64_t src_uv_plane_size =
       ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
@@ -448,13 +446,13 @@ static void YUVToARGBTestFilter(int src_width,
   align_buffer_page_end(src_u, src_uv_plane_size);
   align_buffer_page_end(src_v, src_uv_plane_size);
 
-  int64_t dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
-  int dst_stride_argb = (dst_width) * 4;
+  int64_t dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
+  int dst_stride_argb = (dst_width)*4;
   align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
   align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
   if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
     printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
-    return;
+    return 0;
   }
   // Fill YUV image with continuous ramp, which is less sensitive to
   // subsampling and filtering differences for test purposes.
@@ -483,44 +481,36 @@ static void YUVToARGBTestFilter(int src_width,
       int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
                          dst_argb_opt[(i * dst_stride_argb) + j]);
       if (abs_diff > max_diff) {
-        max_diff = abs_diff;
-      }
-      if (abs_diff > error_threshold) {
-        printf("error %d at %d,%d c %d opt %d\n", abs_diff, j, i,
+        printf("error %d at %d,%d c %d opt %d", abs_diff, j, i,
                dst_argb_c[(i * dst_stride_argb) + j],
                dst_argb_opt[(i * dst_stride_argb) + j]);
-        goto cleanup;
+        EXPECT_LE(abs_diff, 40);
+        max_diff = abs_diff;
       }
     }
   }
 
-cleanup:
-  if (max_diff_out) {
-    *max_diff_out = max_diff;
-  }
   free_aligned_buffer_page_end(dst_argb_c);
   free_aligned_buffer_page_end(dst_argb_opt);
   free_aligned_buffer_page_end(src_y);
   free_aligned_buffer_page_end(src_u);
   free_aligned_buffer_page_end(src_v);
+  return max_diff;
 }
 
 TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
-  int diff = 0;
-  YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
-                      benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
-                      libyuv::kFilterBilinear, benchmark_iterations_, 10,
-                      &diff);
-  ASSERT_LE(diff, 10);
+  int diff =
+      YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
+                          benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
+                          libyuv::kFilterBilinear, benchmark_iterations_);
+  EXPECT_LE(diff, 10);
 }
 
 TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
-  int diff = 0;
-  YUVToARGBTestFilter(benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
-                      benchmark_width_, benchmark_height_,
-                      libyuv::kFilterBilinear, benchmark_iterations_, 10,
-                      &diff);
-  ASSERT_LE(diff, 10);
+  int diff = YUVToARGBTestFilter(
+      benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_,
+      benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_);
+  EXPECT_LE(diff, 10);
 }
 
 TEST_F(LibYUVScaleTest, ARGBTest3x) {
@@ -543,18 +533,18 @@ TEST_F(LibYUVScaleTest, ARGBTest3x) {
               kFilterBilinear);
   }
 
-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
-  ASSERT_EQ(226, dest_pixels[2]);
-  ASSERT_EQ(235, dest_pixels[3]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(226, dest_pixels[2]);
+  EXPECT_EQ(235, dest_pixels[3]);
 
   ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
             kFilterNone);
 
-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
-  ASSERT_EQ(226, dest_pixels[2]);
-  ASSERT_EQ(235, dest_pixels[3]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(226, dest_pixels[2]);
+  EXPECT_EQ(235, dest_pixels[3]);
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
@@ -580,18 +570,18 @@ TEST_F(LibYUVScaleTest, ARGBTest4x) {
               kFilterBilinear);
   }
 
-  ASSERT_NEAR(66, dest_pixels[0], 4);
-  ASSERT_NEAR(255 - 66, dest_pixels[1], 4);
-  ASSERT_NEAR(67, dest_pixels[2], 4);
-  ASSERT_NEAR(76, dest_pixels[3], 4);
+  EXPECT_NEAR(66, dest_pixels[0], 4);
+  EXPECT_NEAR(255 - 66, dest_pixels[1], 4);
+  EXPECT_NEAR(67, dest_pixels[2], 4);
+  EXPECT_NEAR(76, dest_pixels[3], 4);
 
   ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
             kFilterNone);
 
-  ASSERT_EQ(2, dest_pixels[0]);
-  ASSERT_EQ(255 - 2, dest_pixels[1]);
-  ASSERT_EQ(3, dest_pixels[2]);
-  ASSERT_EQ(12, dest_pixels[3]);
+  EXPECT_EQ(2, dest_pixels[0]);
+  EXPECT_EQ(255 - 2, dest_pixels[1]);
+  EXPECT_EQ(3, dest_pixels[2]);
+  EXPECT_EQ(12, dest_pixels[3]);
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc
index b04dda10c..979c70aad 100644
--- a/unit_test/scale_plane_test.cc
+++ b/unit_test/scale_plane_test.cc
@@ -42,108 +42,6 @@
 
 namespace libyuv {
 
-// POC: int row_stride = src_stride * 2 overflows to a small negative value
-// when src_stride is close to INT_MAX, causing src_ptr to walk backward
-// past the start of the source allocation on the second loop iteration.
-// With src_stride = 0x7FFFFFFE, row_stride = (int)0xFFFFFFFC = -4, so on
-// y=1 ScaleRowDown2Box reads 4 bytes before the heap allocation.
-TEST_F(LibYUVScaleTest, ScalePlaneDown2_RowStrideOverflow) {
-  constexpr int kSrcStride = 0x7FFFFFFE;  // INT_MAX - 1
-  constexpr int kSrcW = 64;
-  constexpr int kSrcH = 4;
-  constexpr int kDstW = 32;
-  constexpr int kDstH = 2;
-  // src_size = (kSrcH - 1) * stride + width.
-  size_t src_size = kSrcH - 1;
-  if (src_size > SIZE_MAX / kSrcStride) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size *= kSrcStride;
-  if (src_size > SIZE_MAX - kSrcW) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size += kSrcW;
-
-#if defined(__aarch64__)
-  // Infer malloc can accept a large size for cpu with dot product (a76/a55)
-  int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd);
-#else
-  int has_large_malloc = 1;
-#endif
-  if (!has_large_malloc) {
-    GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes";
-  }
-
-  uint8_t* src = new (std::nothrow) uint8_t[src_size];
-  if (!src) {
-    GTEST_SKIP() << "could not allocate " << src_size << " bytes";
-  }
-  uint8_t dst[kDstW * kDstH];
-  uint8_t* src_row = src;
-  for (int i = 0; i < kSrcH; i++) {
-    memset(src_row, 0x41, kSrcW);
-    src_row += kSrcStride;
-  }
-  // Force the C row kernel: the SIMD kernels are inline asm that ASAN does not
-  // instrument, so they silently read OOB without a report.
-  MaskCpuFlags(1);
-  // 2*dst == src on both axes -> ScalePlane dispatches to ScalePlaneDown2.
-  // int row_stride = kSrcStride * 2 wraps to -4; on y=1 src_ptr underflows.
-  ScalePlane(src, kSrcStride, kSrcW, kSrcH, dst, kDstW, kDstW, kDstH,
-             kFilterBox);
-  MaskCpuFlags(0);
-  delete[] src;
-}
-
-// POC: same defect in the 1/4 fast path. src_stride = 0x3FFFFFFF gives
-// int row_stride = src_stride * 4 = (int)0xFFFFFFFC = -4.
-TEST_F(LibYUVScaleTest, ScalePlaneDown4_RowStrideOverflow) {
-  constexpr int kSrcStride = 0x3FFFFFFF;  // INT_MAX / 4 (rounded down)
-  constexpr int kSrcW = 64;
-  constexpr int kSrcH = 8;
-  constexpr int kDstW = 16;
-  constexpr int kDstH = 2;
-  // src_size = (kSrcH - 1) * stride + width.
-  size_t src_size = kSrcH - 1;
-  if (src_size > SIZE_MAX / kSrcStride) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size *= kSrcStride;
-  if (src_size > SIZE_MAX - kSrcW) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size += kSrcW;
-
-#if defined(__aarch64__)
-  // Infer malloc can accept a large size for cpu with dot product (a76/a55)
-  int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd);
-#else
-  int has_large_malloc = 1;
-#endif
-  if (!has_large_malloc) {
-    GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes";
-  }
-
-  uint8_t* src = new (std::nothrow) uint8_t[src_size];
-  if (!src) {
-    GTEST_SKIP() << "could not allocate " << src_size << " bytes";
-  }
-  uint8_t dst[kDstW * kDstH];
-  uint8_t* src_row = src;
-  for (int i = 0; i < kSrcH; i++) {
-    memset(src_row, 0x41, kSrcW);
-    src_row += kSrcStride;
-  }
-  // Force the C row kernel: the SIMD kernels are inline asm that ASAN does not
-  // instrument, so they silently read OOB without a report.
-  MaskCpuFlags(1);
-  // 4*dst == src on both axes with kFilterBox -> ScalePlaneDown4.
-  ScalePlane(src, kSrcStride, kSrcW, kSrcH, dst, kDstW, kDstW, kDstH,
-             kFilterBox);
-  MaskCpuFlags(0);
-  delete[] src;
-}
-
 #ifdef ENABLE_ROW_TESTS
 #ifdef HAS_SCALEROWDOWN2_SSSE3
 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
@@ -187,49 +85,49 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
     // Test regular half size.
     ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
 
-    ASSERT_EQ(64u, dst_pixels_c[0]);
-    ASSERT_EQ(25u, dst_pixels_c[1]);
-    ASSERT_EQ(13u, dst_pixels_c[2]);
-    ASSERT_EQ(5u, dst_pixels_c[3]);
-    ASSERT_EQ(0u, dst_pixels_c[4]);
-    ASSERT_EQ(133u, dst_pixels_c[63]);
+    EXPECT_EQ(64u, dst_pixels_c[0]);
+    EXPECT_EQ(25u, dst_pixels_c[1]);
+    EXPECT_EQ(13u, dst_pixels_c[2]);
+    EXPECT_EQ(5u, dst_pixels_c[3]);
+    EXPECT_EQ(0u, dst_pixels_c[4]);
+    EXPECT_EQ(133u, dst_pixels_c[63]);
 
     // Test Odd width version - Last pixel is just 1 horizontal pixel.
     ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
 
-    ASSERT_EQ(64u, dst_pixels_c[0]);
-    ASSERT_EQ(25u, dst_pixels_c[1]);
-    ASSERT_EQ(13u, dst_pixels_c[2]);
-    ASSERT_EQ(5u, dst_pixels_c[3]);
-    ASSERT_EQ(0u, dst_pixels_c[4]);
-    ASSERT_EQ(10u, dst_pixels_c[63]);
+    EXPECT_EQ(64u, dst_pixels_c[0]);
+    EXPECT_EQ(25u, dst_pixels_c[1]);
+    EXPECT_EQ(13u, dst_pixels_c[2]);
+    EXPECT_EQ(5u, dst_pixels_c[3]);
+    EXPECT_EQ(0u, dst_pixels_c[4]);
+    EXPECT_EQ(10u, dst_pixels_c[63]);
 
     // Test one pixel less, should skip the last pixel.
     memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
     ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
 
-    ASSERT_EQ(64u, dst_pixels_c[0]);
-    ASSERT_EQ(25u, dst_pixels_c[1]);
-    ASSERT_EQ(13u, dst_pixels_c[2]);
-    ASSERT_EQ(5u, dst_pixels_c[3]);
-    ASSERT_EQ(0u, dst_pixels_c[4]);
-    ASSERT_EQ(0u, dst_pixels_c[63]);
+    EXPECT_EQ(64u, dst_pixels_c[0]);
+    EXPECT_EQ(25u, dst_pixels_c[1]);
+    EXPECT_EQ(13u, dst_pixels_c[2]);
+    EXPECT_EQ(5u, dst_pixels_c[3]);
+    EXPECT_EQ(0u, dst_pixels_c[4]);
+    EXPECT_EQ(0u, dst_pixels_c[63]);
 
     // Test regular half size SSSE3.
     ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
 
-    ASSERT_EQ(64u, dst_pixels_opt[0]);
-    ASSERT_EQ(25u, dst_pixels_opt[1]);
-    ASSERT_EQ(13u, dst_pixels_opt[2]);
-    ASSERT_EQ(5u, dst_pixels_opt[3]);
-    ASSERT_EQ(0u, dst_pixels_opt[4]);
-    ASSERT_EQ(133u, dst_pixels_opt[63]);
+    EXPECT_EQ(64u, dst_pixels_opt[0]);
+    EXPECT_EQ(25u, dst_pixels_opt[1]);
+    EXPECT_EQ(13u, dst_pixels_opt[2]);
+    EXPECT_EQ(5u, dst_pixels_opt[3]);
+    EXPECT_EQ(0u, dst_pixels_opt[4]);
+    EXPECT_EQ(133u, dst_pixels_opt[63]);
 
     // Compare C and SSSE3 match.
     ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
     ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
     for (int i = 0; i < 64; ++i) {
-      ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
     }
   }
 }
@@ -262,11 +160,11 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
   }
 
   for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
-  ASSERT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
-  ASSERT_EQ(dst_pixels_c[1279], 3839);
+  EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
+  EXPECT_EQ(dst_pixels_c[1279], 3839);
 }
 #endif  // ENABLE_ROW_TESTS
 
@@ -346,7 +244,7 @@ static int TestPlaneFilter_16(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),   \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,            \
         benchmark_cpu_info_);                                                  \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
   }
 
 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
@@ -385,12 +283,12 @@ TEST_F(LibYUVScaleTest, PlaneTest3x) {
                kFilterBilinear);
   }
 
-  ASSERT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(225, dest_pixels[0]);
 
   ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
              kFilterNone);
 
-  ASSERT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(225, dest_pixels[0]);
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
@@ -413,12 +311,12 @@ TEST_F(LibYUVScaleTest, PlaneTest4x) {
                kFilterBilinear);
   }
 
-  ASSERT_EQ(66, dest_pixels[0]);
+  EXPECT_EQ(66, dest_pixels[0]);
 
   ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
              kFilterNone);
 
-  ASSERT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
+  EXPECT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
@@ -447,7 +345,7 @@ TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
   }
 
   for (int i = 0; i < kSize; ++i) {
-    ASSERT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
   }
 
   free_aligned_buffer_page_end(dest_c_pixels);
@@ -477,7 +375,7 @@ TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
   }
 
   for (int i = 0; i < kSize; ++i) {
-    ASSERT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
   }
 
   free_aligned_buffer_page_end(dest_c_pixels);
@@ -508,7 +406,7 @@ TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
   }
 
   for (int i = 0; i < kSize; ++i) {
-    ASSERT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
   }
 
   free_aligned_buffer_page_end(dest_c_pixels);
@@ -534,9 +432,9 @@ TEST_F(LibYUVScaleTest, PlaneTest1_Box) {
                      /* dst_width= */ 1, /* dst_height= */ 2,
                      libyuv::kFilterBox);
 
-  ASSERT_EQ(dst_pixels[0], 1);
-  ASSERT_EQ(dst_pixels[1], 1);
-  ASSERT_EQ(dst_pixels[2], 3);
+  EXPECT_EQ(dst_pixels[0], 1);
+  EXPECT_EQ(dst_pixels[1], 1);
+  EXPECT_EQ(dst_pixels[2], 3);
 
   free_aligned_buffer_page_end(dst_pixels);
   free_aligned_buffer_page_end(orig_pixels);
@@ -562,9 +460,9 @@ TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) {
       /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1,
       /* dst_width= */ 1, /* dst_height= */ 2, libyuv::kFilterNone);
 
-  ASSERT_EQ(dst_pixels[0], 1);
-  ASSERT_EQ(dst_pixels[1], 1);
-  ASSERT_EQ(dst_pixels[2], 3);
+  EXPECT_EQ(dst_pixels[0], 1);
+  EXPECT_EQ(dst_pixels[1], 1);
+  EXPECT_EQ(dst_pixels[2], 3);
 
   free_aligned_buffer_page_end(dst_pixels_alloc);
   free_aligned_buffer_page_end(orig_pixels_alloc);
@@ -631,58 +529,9 @@ TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) {
                      kDstHeight, kFilterNone);
 
   // Not reached under ASAN.
-  ASSERT_EQ(0, r);
+  EXPECT_EQ(0, r);
   delete[] src;
   delete[] dst;
 }
 
-TEST_F(LibYUVScaleTest, ScalePlane_InvalidInputs) {
-  uint8_t src[16] = {0};
-  uint8_t dst[16] = {0};
-
-  // NULL src/dst
-  EXPECT_EQ(-1, ScalePlane(nullptr, 4, 4, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, nullptr, 4, 4, 4, kFilterNone));
-
-  // Width/height <= 0 (except src_height which can be negative but not 0)
-  EXPECT_EQ(-1, ScalePlane(src, 4, 0, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, -1, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 0, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, 0, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, -1, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, 4, 0, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, 4, -1, kFilterNone));
-
-  // Width/height too large (> 32768)
-  EXPECT_EQ(-1, ScalePlane(src, 4, 32769, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 32769, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, -32769, dst, 4, 4, 4, kFilterNone));
-
-  // Valid edge cases
-  EXPECT_EQ(0, ScalePlane(src, 4, 1, 1, dst, 4, 1, 1, kFilterNone));
-  EXPECT_EQ(0, ScalePlane(src, 4, 1, -1, dst, 4, 1, 1, kFilterNone));
-}
-
-TEST_F(LibYUVScaleTest, ScalePlane_16_InvalidInputs) {
-  uint16_t src[16] = {0};
-  uint16_t dst[16] = {0};
-
-  EXPECT_EQ(-1, ScalePlane_16(nullptr, 4, 4, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 4, 4, nullptr, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 0, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 32769, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 4, -32769, dst, 4, 4, 4, kFilterNone));
-}
-
-TEST_F(LibYUVScaleTest, ScalePlane_12_InvalidInputs) {
-  uint16_t src[16] = {0};
-  uint16_t dst[16] = {0};
-
-  EXPECT_EQ(-1, ScalePlane_12(nullptr, 4, 4, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 4, 4, nullptr, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 0, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 32769, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 4, -32769, dst, 4, 4, 4, kFilterNone));
-}
-
 }  // namespace libyuv
diff --git a/unit_test/scale_rgb_test.cc b/unit_test/scale_rgb_test.cc
index f6fa1e8ca..8296abe31 100644
--- a/unit_test/scale_rgb_test.cc
+++ b/unit_test/scale_rgb_test.cc
@@ -128,7 +128,7 @@ static int RGBTestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,          \
         benchmark_cpu_info_);                                                \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -163,14 +163,14 @@ TEST_FACTOR(3, 1, 3)
     int diff = RGBTestFilter(benchmark_width_, benchmark_height_, width,     \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
   }                                                                          \
   TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {         \
     int diff = RGBTestFilter(width, height, Abs(benchmark_width_),           \
                              Abs(benchmark_height_), kFilter##filter,        \
                              benchmark_iterations_, disable_cpu_flags_,      \
                              benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -202,7 +202,7 @@ TEST_SCALETO(RGBScale, 1920, 1080)
                              benchmark_height_, benchmark_width_,      \
                              kFilter##filter, benchmark_iterations_,   \
                              disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                         \
+    EXPECT_LE(diff, max_diff);                                         \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -233,14 +233,14 @@ TEST_F(LibYUVScaleTest, RGBTest3x) {
              kFilterBilinear);
   }
 
-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
 
   RGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
            kFilterNone);
 
-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
@@ -264,14 +264,14 @@ TEST_F(LibYUVScaleTest, RGBTest4x) {
              kFilterBilinear);
   }
 
-  ASSERT_EQ(66, dest_pixels[0]);
-  ASSERT_EQ(190, dest_pixels[1]);
+  EXPECT_EQ(66, dest_pixels[0]);
+  EXPECT_EQ(190, dest_pixels[1]);
 
   RGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
            kFilterNone);
 
-  ASSERT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
-  ASSERT_EQ(255 - 2, dest_pixels[1]);
+  EXPECT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
+  EXPECT_EQ(255 - 2, dest_pixels[1]);
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 323094f3f..299fd2381 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -757,7 +757,7 @@ static int NV12TestFilter(int src_width,
   int src_height_uv = (Abs(src_height) + 1) >> 1;
 
   int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
-  int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
+  int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv)*2;
 
   int src_stride_y = Abs(src_width);
   int src_stride_uv = src_width_uv * 2;
@@ -775,7 +775,7 @@ static int NV12TestFilter(int src_width,
   int dst_height_uv = (dst_height + 1) >> 1;
 
   int64_t dst_y_plane_size = (dst_width) * (dst_height);
-  int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
+  int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv)*2;
 
   int dst_stride_y = dst_width;
   int dst_stride_uv = dst_width_uv * 2;
@@ -856,7 +856,7 @@ static int NV12TestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
         benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) {                 \
     int diff = I444TestFilter(                                                \
@@ -864,7 +864,7 @@ static int NV12TestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
         benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_12) { \
     int diff = I420TestFilter_12(                                             \
@@ -872,7 +872,7 @@ static int NV12TestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
         benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_12) { \
     int diff = I444TestFilter_12(                                             \
@@ -880,7 +880,7 @@ static int NV12TestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
         benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) {                 \
     int diff = NV12TestFilter(                                                \
@@ -888,7 +888,7 @@ static int NV12TestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
         benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }
 
 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
@@ -931,61 +931,61 @@ TEST_FACTOR(3, 1, 3, 0)
     int diff = I420TestFilter(benchmark_width_, benchmark_height_, width,     \
                               height, kFilter##filter, benchmark_iterations_, \
                               disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter) {      \
     int diff = I444TestFilter(benchmark_width_, benchmark_height_, width,     \
                               height, kFilter##filter, benchmark_iterations_, \
                               disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I420##name##To##width##x##height##_##filter##_12) {       \
     int diff = I420TestFilter_12(                                             \
         benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
         benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I444##name##To##width##x##height##_##filter##_12) {       \
     int diff = I444TestFilter_12(                                             \
         benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
         benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I420##name##To##width##x##height##_##filter##_16) {       \
     int diff = I420TestFilter_16(                                             \
         benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
         benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I444##name##To##width##x##height##_##filter##_16) {       \
     int diff = I444TestFilter_16(                                             \
         benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
         benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) {      \
     int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width,     \
                               height, kFilter##filter, benchmark_iterations_, \
                               disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) {    \
     int diff = I420TestFilter(width, height, Abs(benchmark_width_),           \
                               Abs(benchmark_height_), kFilter##filter,        \
                               benchmark_iterations_, disable_cpu_flags_,      \
                               benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, I444##name##From##width##x##height##_##filter) {    \
     int diff = I444TestFilter(width, height, Abs(benchmark_width_),           \
                               Abs(benchmark_height_), kFilter##filter,        \
                               benchmark_iterations_, disable_cpu_flags_,      \
                               benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I420##name##From##width##x##height##_##filter##_12) {     \
@@ -993,7 +993,7 @@ TEST_FACTOR(3, 1, 3, 0)
                                  Abs(benchmark_height_), kFilter##filter,     \
                                  benchmark_iterations_, disable_cpu_flags_,   \
                                  benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I444##name##From##width##x##height##_##filter##_12) {     \
@@ -1001,7 +1001,7 @@ TEST_FACTOR(3, 1, 3, 0)
                                  Abs(benchmark_height_), kFilter##filter,     \
                                  benchmark_iterations_, disable_cpu_flags_,   \
                                  benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I420##name##From##width##x##height##_##filter##_16) {     \
@@ -1009,7 +1009,7 @@ TEST_FACTOR(3, 1, 3, 0)
                                  Abs(benchmark_height_), kFilter##filter,     \
                                  benchmark_iterations_, disable_cpu_flags_,   \
                                  benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
          DISABLED_##I444##name##From##width##x##height##_##filter##_16) {     \
@@ -1017,14 +1017,14 @@ TEST_FACTOR(3, 1, 3, 0)
                                  Abs(benchmark_height_), kFilter##filter,     \
                                  benchmark_iterations_, disable_cpu_flags_,   \
                                  benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) {    \
     int diff = NV12TestFilter(width, height, Abs(benchmark_width_),           \
                               Abs(benchmark_height_), kFilter##filter,        \
                               benchmark_iterations_, disable_cpu_flags_,      \
                               benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
   }
 
 #ifndef DISABLE_SLOW_TESTS
@@ -1068,49 +1068,49 @@ TEST_SCALETO(Scale, 1080, 1920)  // for rotated phones
                               benchmark_height_, benchmark_width_,         \
                               kFilter##filter, benchmark_iterations_,      \
                               disable_cpu_flags_, benchmark_cpu_info_);    \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
   }                                                                        \
   TEST_F(LibYUVScaleTest, I444##name##SwapXY_##filter) {                   \
     int diff = I444TestFilter(benchmark_width_, benchmark_height_,         \
                               benchmark_height_, benchmark_width_,         \
                               kFilter##filter, benchmark_iterations_,      \
                               disable_cpu_flags_, benchmark_cpu_info_);    \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
   }                                                                        \
   TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_12) {   \
     int diff = I420TestFilter_12(benchmark_width_, benchmark_height_,      \
                                  benchmark_height_, benchmark_width_,      \
                                  kFilter##filter, benchmark_iterations_,   \
                                  disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
   }                                                                        \
   TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_12) {   \
     int diff = I444TestFilter_12(benchmark_width_, benchmark_height_,      \
                                  benchmark_height_, benchmark_width_,      \
                                  kFilter##filter, benchmark_iterations_,   \
                                  disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
   }                                                                        \
   TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) {   \
     int diff = I420TestFilter_16(benchmark_width_, benchmark_height_,      \
                                  benchmark_height_, benchmark_width_,      \
                                  kFilter##filter, benchmark_iterations_,   \
                                  disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
   }                                                                        \
   TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) {   \
     int diff = I444TestFilter_16(benchmark_width_, benchmark_height_,      \
                                  benchmark_height_, benchmark_width_,      \
                                  kFilter##filter, benchmark_iterations_,   \
                                  disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
   }                                                                        \
   TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) {                   \
     int diff = NV12TestFilter(benchmark_width_, benchmark_height_,         \
                               benchmark_height_, benchmark_width_,         \
                               kFilter##filter, benchmark_iterations_,      \
                               disable_cpu_flags_, benchmark_cpu_info_);    \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
   }
 
 // Test scale to a specified size with all 4 filters.
diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc
index df1e4c54c..dab217c97 100644
--- a/unit_test/scale_uv_test.cc
+++ b/unit_test/scale_uv_test.cc
@@ -101,7 +101,7 @@ static int UVTestFilter(int src_width,
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_,          \
         benchmark_cpu_info_);                                                \
-    ASSERT_EQ(0, diff);                                                      \
+    EXPECT_EQ(0, diff);                                                      \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -132,14 +132,14 @@ TEST_FACTOR(3, 1, 3)
     int diff = UVTestFilter(benchmark_width_, benchmark_height_, width,     \
                             height, kFilter##filter, benchmark_iterations_, \
                             disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                              \
+    EXPECT_LE(diff, max_diff);                                              \
   }                                                                         \
   TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {        \
     int diff = UVTestFilter(width, height, Abs(benchmark_width_),           \
                             Abs(benchmark_height_), kFilter##filter,        \
                             benchmark_iterations_, disable_cpu_flags_,      \
                             benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                              \
+    EXPECT_LE(diff, max_diff);                                              \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -171,7 +171,7 @@ TEST_SCALETO(UVScale, 1920, 1080)
         UVTestFilter(benchmark_width_, benchmark_height_, benchmark_height_,   \
                      benchmark_width_, kFilter##filter, benchmark_iterations_, \
                      disable_cpu_flags_, benchmark_cpu_info_);                 \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -202,14 +202,14 @@ TEST_F(LibYUVScaleTest, UVTest3x) {
             kFilterBilinear);
   }
 
-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
 
   UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
           kFilterNone);
 
-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
@@ -233,14 +233,14 @@ TEST_F(LibYUVScaleTest, UVTest4x) {
             kFilterBilinear);
   }
 
-  ASSERT_EQ(66, dest_pixels[0]);
-  ASSERT_EQ(190, dest_pixels[1]);
+  EXPECT_EQ(66, dest_pixels[0]);
+  EXPECT_EQ(190, dest_pixels[1]);
 
   UVScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
           kFilterNone);
 
-  ASSERT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
-  ASSERT_EQ(255 - 2, dest_pixels[1]);
+  EXPECT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
+  EXPECT_EQ(255 - 2, dest_pixels[1]);
 
   free_aligned_buffer_page_end(dest_pixels);
   free_aligned_buffer_page_end(orig_pixels);
diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc
index 163e3ffdb..b737a0321 100644
--- a/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -169,6 +169,9 @@ static int TestCpuEnv(int cpu_info) {
   if (TestEnv("LIBYUV_DISABLE_AMXINT8")) {
     cpu_info &= ~libyuv::kCpuHasAMXINT8;
   }
+  if (TestEnv("LIBYUV_DISABLE_AVX512BMM")) {
+    cpu_info &= ~libyuv::kCpuHasAVX512BMM;
+  }
 #endif
   if (TestEnv("LIBYUV_DISABLE_ASM")) {
     cpu_info = libyuv::kCpuInitialized;
diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h
index e9a55c62f..2c11c983f 100644
--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@@ -85,11 +85,10 @@ static inline bool SizeValid(int src_width,
 #define align_buffer_page_end_16(var, size)                                 \
   uint16_t* var = NULL;                                                     \
   uint8_t* var##_mem =                                                      \
-      reinterpret_cast<uint8_t*>(malloc(((size) * 2 + 4095 + 63) & ~4095)); \
+      reinterpret_cast<uint8_t*>(malloc(((size)*2 + 4095 + 63) & ~4095));   \
   if (var##_mem)                                                            \
   var = reinterpret_cast<uint16_t*>(                                        \
-      (intptr_t)(var##_mem + (((size) * 2 + 4095 + 63) & ~4095) -           \
-                 (size) * 2) &                                              \
+      (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \
       ~63)
 
 #define free_aligned_buffer_page_end_16(var) \
diff --git a/unit_test/video_common_test.cc b/unit_test/video_common_test.cc
index 9ff99faac..36728ea90 100644
--- a/unit_test/video_common_test.cc
+++ b/unit_test/video_common_test.cc
@@ -36,77 +36,77 @@ static bool TestValidFourCC(uint32_t fourcc, int bpp) {
 }
 
 TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
 }
 
 TEST_F(LibYUVBaseTest, TestFourCC) {
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));  // deprecated.
-  ASSERT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));  // deprecated.
-  ASSERT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));  // deprecated.
+  EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));  // deprecated.
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
 }
 
 }  // namespace libyuv
diff --git a/util/cpuid.c b/util/cpuid.c
index 38b2c0e9d..bbaea8398 100644
--- a/util/cpuid.c
+++ b/util/cpuid.c
@@ -15,6 +15,8 @@
 #ifdef __linux__
 #include <ctype.h>
 #include <sys/utsname.h>
+#include <signal.h>
+#include <setjmp.h>
 #endif
 
 #include "libyuv/cpu_id.h"
@@ -40,6 +42,14 @@ static void KernelVersion(int* version) {
 }
 #endif
 
+#ifdef __linux__
+static sigjmp_buf vdpphps_jmpbuf;
+static void vdpphps_sigill_handler(int sig) {
+  (void)sig;
+  siglongjmp(vdpphps_jmpbuf, 1);
+}
+#endif
+
 int main(int argc, const char* argv[]) {
   (void)argc;
   (void)argv;
@@ -182,6 +192,7 @@ int main(int argc, const char* argv[]) {
     int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
     int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
     int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
+    int has_avx512bmm = TestCpuFlag(kCpuHasAVX512BMM);
     printf("Has X86 0x%x\n", has_x86);
     printf("Has SSE2 0x%x\n", has_sse2);
     printf("Has SSSE3 0x%x\n", has_ssse3);
@@ -204,6 +215,30 @@ int main(int argc, const char* argv[]) {
     printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
     printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
     printf("Has AMXINT8 0x%x\n", has_amxint8);
+    printf("Has AVX512BMM 0x%x\n", has_avx512bmm);
+
+#ifdef __linux__
+    // Test VDPPHPS instruction
+    {
+      struct sigaction act, oldact;
+      memset(&act, 0, sizeof(act));
+      act.sa_handler = vdpphps_sigill_handler;
+      sigaction(SIGILL, &act, &oldact);
+
+      printf("Testing VDPPHPS instruction... ");
+      fflush(stdout);
+
+      if (sigsetjmp(vdpphps_jmpbuf, 1) == 0) {
+        // VDPPHPS xmm0, xmm0, xmm0
+        __asm__ volatile("vdpphps %%xmm0, %%xmm0, %%xmm0" : : : "xmm0");
+        printf("Works!\n");
+      } else {
+        printf("Crashed (SIGILL)!\n");
+      }
+
+      sigaction(SIGILL, &oldact, NULL);
+    }
+#endif
   }
 #endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
         // defined(_M_X64)
diff --git a/util/ssim.cc b/util/ssim.cc
index f8b4509f8..096fbcf06 100644
--- a/util/ssim.cc
+++ b/util/ssim.cc
@@ -244,23 +244,23 @@ double GetSSIMFullKernel(const uint8_t* org,
 
 // Read 8 pixels at line #L, and convert to 16bit, perform weighting
 // and acccumulate.
-#define LOAD_LINE_PAIR(L, WEIGHT)                                              \
-  do {                                                                         \
-    const __m128i v0 =                                                         \
-        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
-    const __m128i v1 =                                                         \
-        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
-    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
-    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
-    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
-    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
-    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
-    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
-    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
-    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
-    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
-    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
-    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
+#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
+  do {                                                                       \
+    const __m128i v0 =                                                       \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
+    const __m128i v1 =                                                       \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
+    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
+    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
+    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
+    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
+    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
+    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
+    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
+    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
+    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
+    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
+    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
   } while (0)
 
 #define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \