Add AVX512BMM detection and vdpphps to util/cpuid

Bug: None
Change-Id: I9954f96a74e653e3ecd3fbeba533299fa8e57d95
2026-06-15 08:26:06 +08:00 · 2026-06-09 11:30:45 -07:00 · 2026-06-09 11:30:45 -07:00 · ca577883ae
commit ca577883ae
parent 3bdb3b94ca
69 changed files with 8440 additions and 8659 deletions
--- a/Android.mk
+++ b/Android.mk
@ -1,7 +1,4 @@
 # This is the Android makefile for libyuv for NDK.
-
-# Ignore this file during non-NDK builds.
-ifdef NDK_ROOT
 LOCAL_PATH:= $(call my-dir)

 include $(CLEAR_VARS)
@ -107,4 +104,3 @@ LOCAL_SRC_FILES := \

 LOCAL_MODULE := libyuv_unittest
 include $(BUILD_NATIVE_TEST)
-endif  # NDK_ROOT
--- a/BUILD.gn
+++ b/BUILD.gn
@ -22,6 +22,13 @@ declare_args() {

 config("libyuv_config") {
  include_dirs = [ "include" ]
+  if (is_android) {
+    if (target_cpu == "arm" || target_cpu == "x86") {
+      ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ]
+    } else {
+      ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ]
+    }
+  }

  # Define CHROMIUM to tell cpu_id to avoid sandbox unsafe system calls.
  defines = [ "CHROMIUM" ]
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1948
+Version: 1937
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
--- a/docs/environment_variables.md
+++ b/docs/environment_variables.md
@ -33,6 +33,7 @@ By default the cpu is detected and the most advanced form of SIMD is used.  But
    LIBYUV_DISABLE_AVXVNNI
    LIBYUV_DISABLE_AVXVNNIINT8
    LIBYUV_DISABLE_AMXINT8
+    LIBYUV_DISABLE_AVX512BMM

 ## Arm CPUs

--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@ -72,6 +72,7 @@ Additional commonly used compiler options can be passed to Bazel via `--copt`:

    bazel build -c opt --config=android_arm64 \
        --copt=-DLIBYUV_UNLIMITED_DATA \
+        --copt=-DLIBYUV_BIT_EXACT=1 \
        --copt=-DENABLE_ROW_TESTS \
        //:libyuv_test

--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@ -888,45 +888,6 @@ int ABGRToI420(const uint8_t* src_abgr,
               int width,
               int height);

-// BGRA little endian (argb in memory) to I422.
-LIBYUV_API
-int BGRAToI422(const uint8_t* src_bgra,
-               int src_stride_bgra,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
-// ABGR little endian (rgba in memory) to I422.
-LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
-               int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
-// RGBA little endian (abgr in memory) to I422.
-LIBYUV_API
-int RGBAToI422(const uint8_t* src_rgba,
-               int src_stride_rgba,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
 int RGBAToI420(const uint8_t* src_rgba,
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@ -245,19 +245,6 @@ int ARGBToI422(const uint8_t* src_argb,
               int width,
               int height);

-// Convert ABGR To I422.
-LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
-               int src_stride_abgr,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_u,
-               int dst_stride_u,
-               uint8_t* dst_v,
-               int dst_stride_v,
-               int width,
-               int height);
-
 // RGB to I444 with matrix. See ArgbConstants at the top of this file for usage.
 LIBYUV_API
 int ARGBToI422Matrix(const uint8_t* src_argb,
--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@ -60,6 +60,7 @@ static const int kCpuHasAVX10_2 = 0x2000000;
 static const int kCpuHasAVXVNNI = 0x4000000;
 static const int kCpuHasAVXVNNIINT8 = 0x8000000;
 static const int kCpuHasAMXINT8 = 0x10000000;
+static const int kCpuHasAVX512BMM = 0x20000000;

 // These flags are only valid on LOONGARCH processors.
 static const int kCpuHasLOONGARCH = 0x20;
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@ -631,8 +631,8 @@ static inline void I422ToRGB565Row_SVE_SC(
      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth     %[vl]                                   \n"
      "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X
-          RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+          RGB8TORGB565_SVE_FROM_TOP_2X
      // Need to permute the data on the final iteration such that the
      // predicates (.b) line up with the 16-bit element data.
      "trn1     z20.b, z18.b, z19.b                     \n"
@ -694,8 +694,8 @@ static inline void I422ToARGB1555Row_SVE_SC(
      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth     %[vl]                                   \n"
      "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X
-          RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+          RGB8TOARGB1555_SVE_FROM_TOP_2X
      "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"

      "99:                                              \n"
@ -753,8 +753,8 @@ static inline void I422ToARGB4444Row_SVE_SC(
      // Calculate a predicate for the final iteration to deal with the tail.
      "cnth     %[vl]                                   \n"
      "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X
-          RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+          RGB8TOARGB4444_SVE_FROM_TOP_2X
      "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"

      "99:                                              \n"
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1948
+#define LIBYUV_VERSION 1937

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/libyuv.gyp
+++ b/libyuv.gyp
@ -122,6 +122,18 @@
          'include',
          '.',
        ],
+        'conditions': [
+          ['OS == "android" and target_arch == "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker64',
+            ],
+          }],
+          ['OS == "android" and target_arch != "arm64"', {
+            'ldflags': [
+              '-Wl,--dynamic-linker,/system/bin/linker',
+            ],
+          }],
+        ], #conditions
      },
      'sources': [
        '<@(libyuv_sources)',
--- a/libyuv.gypi
+++ b/libyuv.gypi
@ -69,7 +69,6 @@
      'source/row_lsx.cc',
      'source/row_neon.cc',
      'source/row_neon64.cc',
-      'source/row_rvv.cc',
      'source/row_win.cc',
      'source/scale.cc',
      'source/scale_any.cc',
@ -80,7 +79,6 @@
      'source/scale_neon.cc',
      'source/scale_neon64.cc',
      'source/scale_rgb.cc',
-      'source/scale_rvv.cc',
      'source/scale_uv.cc',
      'source/scale_win.cc',
      'source/video_common.cc',
--- a/source/compare.cc
+++ b/source/compare.cc
@ -11,7 +11,6 @@
 #include "libyuv/compare.h"

 #include <float.h>
-#include <limits.h>
 #include <math.h>
 #ifdef _OPENMP
 #include <omp.h>
@ -107,11 +106,8 @@ uint32_t ARGBDetect(const uint8_t* argb,
  uint32_t fourcc = 0;
  int h;

-  if (!argb || width <= 0 || height <= 0) {
-    return fourcc;
-  }
  // Coalesce rows.
-  if (stride_argb == width * 4 && (ptrdiff_t)width * height <= INT_MAX) {
+  if (stride_argb == width * 4) {
    width *= height;
    height = 1;
    stride_argb = 0;
@ -249,12 +245,8 @@ uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
                                    int height) {
  uint64_t sse = 0;
  int h;
-  if (!src_a || !src_b || width <= 0 || height <= 0) {
-    return sse;
-  }
  // Coalesce rows.
-  if (stride_a == width && stride_b == width &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+  if (stride_a == width && stride_b == width) {
    width *= height;
    height = 1;
    stride_a = stride_b = 0;
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@ -116,7 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  const uint32_t c16 = 0x92d9e201;  // 33^16
  uint32_t tmp, tmp2;
-  asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
+      asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
      "ld1         {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"

      // count is always a multiple of 16.
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@ -41,9 +41,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
  return diff;
 }

-__declspec(naked) uint32_t SumSquareError_SSE2(const uint8_t* src_a,
-                                               const uint8_t* src_b,
-                                               int count) {
+__declspec(naked) uint32_t
+    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
@ -82,9 +81,8 @@ __declspec(naked) uint32_t SumSquareError_SSE2(const uint8_t* src_a,
 #ifdef HAS_SUMSQUAREERROR_AVX2
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
 #pragma warning(disable : 4752)
-__declspec(naked) uint32_t SumSquareError_AVX2(const uint8_t* src_a,
-                                               const uint8_t* src_b,
-                                               int count) {
+__declspec(naked) uint32_t
+    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
@ -148,9 +146,8 @@ uvec32 kHashMul3 = {
    0x00000001,  // 33 ^ 0
 };

-__declspec(naked) uint32_t HashDjb2_SSE41(const uint8_t* src,
-                                          int count,
-                                          uint32_t seed) {
+__declspec(naked) uint32_t
+    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
@ -200,9 +197,8 @@ __declspec(naked) uint32_t HashDjb2_SSE41(const uint8_t* src,

 // Visual C 2012 required for AVX2.
 #ifdef HAS_HASHDJB2_AVX2
-__declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src,
-                                         int count,
-                                         uint32_t seed) {
+__declspec(naked) uint32_t
+    HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
--- a/source/convert.cc
+++ b/source/convert.cc
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@ -10,8 +10,6 @@

 #include "libyuv/convert_from.h"

-#include <limits.h>
-
 #include "libyuv/basic_types.h"
 #include "libyuv/convert.h"  // For I420Copy
 #include "libyuv/cpu_id.h"
@ -89,16 +87,16 @@ int I420ToI010(const uint8_t* src_y,
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -134,16 +132,16 @@ int I420ToI012(const uint8_t* src_y,
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -228,7 +226,7 @@ int I010ToI410(const uint16_t* src_y,
               int height) {
  int r;
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
    return -1;
  }

@ -265,7 +263,7 @@ int I210ToI410(const uint16_t* src_y,
               int height) {
  int r;
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
    return -1;
  }

@ -301,7 +299,7 @@ int I422ToI444(const uint8_t* src_y,
               int height) {
  int r;
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
    return -1;
  }

@ -326,7 +324,7 @@ int I400Copy(const uint8_t* src_y,
             int dst_stride_y,
             int width,
             int height) {
-  if (!src_y || !dst_y || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
    return -1;
  }
  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
@ -348,20 +346,18 @@ int I422ToYUY2(const uint8_t* src_y,
  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
      I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_yuy2 = dst_yuy2 + (ptrdiff_t)(height - 1) * dst_stride_yuy2;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
    dst_stride_yuy2 = -dst_stride_yuy2;
  }
  // Coalesce rows.
  if (src_stride_y == width && src_stride_u * 2 == width &&
-      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
    width *= height;
    height = 1;
    src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@ -416,14 +412,13 @@ int I420ToYUY2(const uint8_t* src_y,
  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
      I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_yuy2 = dst_yuy2 + (ptrdiff_t)(height - 1) * dst_stride_yuy2;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
    dst_stride_yuy2 = -dst_stride_yuy2;
  }
 #if defined(HAS_I422TOYUY2ROW_SSE2)
@ -497,20 +492,18 @@ int I422ToUYVY(const uint8_t* src_y,
  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
      I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_uyvy = dst_uyvy + (ptrdiff_t)(height - 1) * dst_stride_uyvy;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
    dst_stride_uyvy = -dst_stride_uyvy;
  }
  // Coalesce rows.
  if (src_stride_y == width && src_stride_u * 2 == width &&
-      src_stride_v * 2 == width && dst_stride_uyvy == width * 2 &&
-      (ptrdiff_t)width * height <= INT_MAX) {
+      src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
    width *= height;
    height = 1;
    src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@ -581,14 +574,13 @@ int I420ToUYVY(const uint8_t* src_y,
  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
      I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_uyvy = dst_uyvy + (ptrdiff_t)(height - 1) * dst_stride_uyvy;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
    dst_stride_uyvy = -dst_stride_uyvy;
  }
 #if defined(HAS_I422TOUYVYROW_SSE2)
@ -663,16 +655,16 @@ int I420ToNV12(const uint8_t* src_y,
  int halfwidth = (width + 1) / 2;
  int halfheight = (height + 1) / 2;
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -718,8 +710,7 @@ int ConvertFromI420(const uint8_t* y,
                    uint32_t fourcc) {
  uint32_t format = CanonicalFourCC(fourcc);
  int r = 0;
-  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0 ||
-      height == INT_MIN) {
+  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
    return -1;
  }
  switch (format) {
@ -791,7 +782,7 @@ int ConvertFromI420(const uint8_t* y,
      break;
    case FOURCC_NV12: {
      int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
-      uint8_t* dst_uv = dst_sample + (ptrdiff_t)dst_y_stride * height;
+      uint8_t* dst_uv = dst_sample + dst_y_stride * height;
      r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                     dst_sample_stride ? dst_sample_stride : width, dst_uv,
                     dst_sample_stride ? dst_sample_stride : width, width,
@ -800,7 +791,7 @@ int ConvertFromI420(const uint8_t* y,
    }
    case FOURCC_NV21: {
      int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
-      uint8_t* dst_vu = dst_sample + (ptrdiff_t)dst_y_stride * height;
+      uint8_t* dst_vu = dst_sample + dst_y_stride * height;
      r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                     dst_sample_stride ? dst_sample_stride : width, dst_vu,
                     dst_sample_stride ? dst_sample_stride : width, width,
@ -816,11 +807,11 @@ int ConvertFromI420(const uint8_t* y,
      uint8_t* dst_u;
      uint8_t* dst_v;
      if (format == FOURCC_YV12) {
-        dst_v = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_u = dst_v + (ptrdiff_t)halfstride * halfheight;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * halfheight;
      } else {
-        dst_u = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_v = dst_u + (ptrdiff_t)halfstride * halfheight;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * halfheight;
      }
      r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                   dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
@ -834,11 +825,11 @@ int ConvertFromI420(const uint8_t* y,
      uint8_t* dst_u;
      uint8_t* dst_v;
      if (format == FOURCC_YV16) {
-        dst_v = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_u = dst_v + (ptrdiff_t)halfstride * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * height;
      } else {
-        dst_u = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_v = dst_u + (ptrdiff_t)halfstride * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * height;
      }
      r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                     dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
@ -851,11 +842,11 @@ int ConvertFromI420(const uint8_t* y,
      uint8_t* dst_u;
      uint8_t* dst_v;
      if (format == FOURCC_YV24) {
-        dst_v = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_u = dst_v + (ptrdiff_t)dst_sample_stride * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + dst_sample_stride * height;
      } else {
-        dst_u = dst_sample + (ptrdiff_t)dst_sample_stride * height;
-        dst_v = dst_u + (ptrdiff_t)dst_sample_stride * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + dst_sample_stride * height;
      }
      r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                     dst_sample_stride, dst_u, dst_sample_stride, dst_v,
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
--- a/source/convert_to_argb.cc
+++ b/source/convert_to_argb.cc
@ -11,7 +11,6 @@
 #include "libyuv/convert_argb.h"

 #include <limits.h>
-#include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>

@ -51,26 +50,12 @@ int ConvertToARGB(const uint8_t* sample,
                  int crop_height,
                  enum RotationMode rotation,
                  uint32_t fourcc) {
-  if (src_height == INT_MIN || crop_height == INT_MIN) {
-    return -1;
-  }
-
-  int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
-  if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
-      src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
-      src_height == 0 || crop_height == 0 || crop_x < 0 || crop_y < 0 ||
-      crop_width > src_width || crop_x > src_width - crop_width ||
-      abs_crop_height > abs_src_height ||
-      crop_y > abs_src_height - abs_crop_height) {
-    return -1;
-  }
-
  uint32_t format = CanonicalFourCC(fourcc);
  int aligned_src_width = (src_width + 1) & ~1;
  const uint8_t* src;
  const uint8_t* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
  int r = 0;

  // One pass rotation is available for some formats. For the rest, convert
@ -83,8 +68,13 @@ int ConvertToARGB(const uint8_t* sample,
  uint8_t* dest_argb = dst_argb;
  int dest_dst_stride_argb = dst_stride_argb;
  uint8_t* rotate_buffer = NULL;
-  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;

+  if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
+      src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
  if (src_height < 0) {
    inv_crop_height = -inv_crop_height;
  }
@ -106,97 +96,95 @@ int ConvertToARGB(const uint8_t* sample,
  switch (format) {
    // Single plane formats
    case FOURCC_YUY2:
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                     crop_width, inv_crop_height);
      break;
    case FOURCC_UYVY:
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                     crop_width, inv_crop_height);
      break;
    case FOURCC_24BG:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
                      inv_crop_height);
      break;
    case FOURCC_RAW:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
                    inv_crop_height);
      break;
    case FOURCC_ARGB:
      if (!need_buf && !rotation) {
-        src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+        src = sample + (src_width * crop_y + crop_x) * 4;
        r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
                       crop_width, inv_crop_height);
      }
      break;
    case FOURCC_BGRA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_ABGR:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_RGBA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_AR30:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_AB30:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_RGBP:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
      r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                       crop_width, inv_crop_height);
      break;
    case FOURCC_RGBO:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                         crop_width, inv_crop_height);
      break;
    case FOURCC_R444:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                         crop_width, inv_crop_height);
      break;
    case FOURCC_I400:
-      src = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      src = sample + src_width * crop_y + crop_x;
      r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_J400:
-      src = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      src = sample + src_width * crop_y + crop_x;
      r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
                     inv_crop_height);
      break;

    // Biplanar formats
    case FOURCC_NV12:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample +
-               aligned_src_width * ((ptrdiff_t)abs_src_height + crop_y / 2) +
-               crop_x;
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
      r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
                     dst_stride_argb, crop_width, inv_crop_height);
      break;
    case FOURCC_NV21:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample +
-               aligned_src_width * ((ptrdiff_t)abs_src_height + crop_y / 2) +
-               crop_x;
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
      // Call NV12 but with u and v parameters swapped.
      r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
                     dst_stride_argb, crop_width, inv_crop_height);
@ -204,21 +192,21 @@ int ConvertToARGB(const uint8_t* sample,
    // Triplanar formats
    case FOURCC_I420:
    case FOURCC_YV12: {
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
      const uint8_t* src_u;
      const uint8_t* src_v;
      int halfwidth = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
      if (format == FOURCC_YV12) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) + crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+                (halfwidth * crop_y + crop_x) / 2;
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
      } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) + crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+                (halfwidth * crop_y + crop_x) / 2;
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
      }
      r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
@ -228,12 +216,11 @@ int ConvertToARGB(const uint8_t* sample,
    case FOURCC_J420: {
      int halfwidth = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
      r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
@ -242,12 +229,11 @@ int ConvertToARGB(const uint8_t* sample,
    case FOURCC_H420: {
      int halfwidth = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
      r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
@ -256,12 +242,11 @@ int ConvertToARGB(const uint8_t* sample,
    case FOURCC_U420: {
      int halfwidth = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             ((ptrdiff_t)halfwidth * crop_y + crop_x) / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)halfheight + crop_y / 2) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u = sample + src_width * abs_src_height +
+                             (halfwidth * crop_y + crop_x) / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
      r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
@ -270,19 +255,19 @@ int ConvertToARGB(const uint8_t* sample,
    case FOURCC_I422:
    case FOURCC_YV16: {
      int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* src_u;
      const uint8_t* src_v;
      if (format == FOURCC_YV16) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
      } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
      }
      r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
@ -291,12 +276,11 @@ int ConvertToARGB(const uint8_t* sample,

    case FOURCC_J422: {
      int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)abs_src_height + crop_y) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
      r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
@ -304,12 +288,11 @@ int ConvertToARGB(const uint8_t* sample,

    case FOURCC_H422: {
      int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)abs_src_height + crop_y) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
      r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
@ -317,12 +300,11 @@ int ConvertToARGB(const uint8_t* sample,

    case FOURCC_U422: {
      int halfwidth = (src_width + 1) / 2;
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                             (ptrdiff_t)halfwidth * crop_y + crop_x / 2;
-      const uint8_t* src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                             halfwidth * ((ptrdiff_t)abs_src_height + crop_y) +
-                             crop_x / 2;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u =
+          sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+      const uint8_t* src_v = sample + src_width * abs_src_height +
+                             halfwidth * (abs_src_height + crop_y) + crop_x / 2;
      r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
@ -330,19 +312,15 @@ int ConvertToARGB(const uint8_t* sample,

    case FOURCC_I444:
    case FOURCC_YV24: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* src_u;
      const uint8_t* src_v;
      if (format == FOURCC_YV24) {
-        src_v =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      } else {
-        src_u =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      }
      r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
@ -350,36 +328,33 @@ int ConvertToARGB(const uint8_t* sample,
    }

    case FOURCC_J444: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u =
-          sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-      const uint8_t* src_v =
-          sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-          crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
    }

    case FOURCC_H444: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u =
-          sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-      const uint8_t* src_v =
-          sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-          crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
    }

    case FOURCC_U444: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
-      const uint8_t* src_u =
-          sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-      const uint8_t* src_v =
-          sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-          crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
+      src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+      src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
      break;
@ -402,7 +377,7 @@ int ConvertToARGB(const uint8_t* sample,
    }
    free(rotate_buffer);
  } else if (rotation) {
-    src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+    src = sample + (src_width * crop_y + crop_x) * 4;
    r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
                   inv_crop_height, rotation);
  }
--- a/source/convert_to_i420.cc
+++ b/source/convert_to_i420.cc
@ -44,24 +44,12 @@ int ConvertToI420(const uint8_t* sample,
                  int crop_height,
                  enum RotationMode rotation,
                  uint32_t fourcc) {
-  if (src_height == INT_MIN || crop_height == INT_MIN) {
-    return -1;
-  }
-
-  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
-  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-
-  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
-      src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
-      crop_height == 0 || crop_x < 0 || crop_y < 0 || crop_width > src_width ||
-      crop_x > src_width - crop_width || abs_crop_height > abs_src_height ||
-      crop_y > abs_src_height - abs_crop_height) {
-    return -1;
-  }
-
  uint32_t format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
  const uint8_t* src;
  const uint8_t* src_uv;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
  int r = 0;
  LIBYUV_BOOL need_buf =
      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
@ -76,7 +64,12 @@ int ConvertToI420(const uint8_t* sample,
  uint8_t* rotate_buffer = NULL;
  const int inv_crop_height =
      (src_height < 0) ? -abs_crop_height : abs_crop_height;
-  int aligned_src_width = (src_width + 1) & ~1;
+
+  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+      src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
+      crop_height == 0) {
+    return -1;
+  }

  // One pass rotation is available for some formats. For the rest, convert
  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
@ -84,14 +77,14 @@ int ConvertToI420(const uint8_t* sample,
  // For in-place conversion, if destination dst_y is same as source sample,
  // also enable temporary buffer.
  if (need_buf) {
-    size_t y_size = (size_t)crop_width * abs_crop_height;
-    size_t uv_size =
-        (size_t)((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-    if (uv_size > SIZE_MAX / 2 || y_size > SIZE_MAX - uv_size * 2) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    const uint64_t rotate_buffer_size =
+        (uint64_t)y_size + (uint64_t)uv_size * 2;
+    if (rotate_buffer_size > SIZE_MAX) {
      return -1;  // Invalid size.
    }
-    const size_t rotate_buffer_size = y_size + uv_size * 2;
-    rotate_buffer = (uint8_t*)malloc(rotate_buffer_size);
+    rotate_buffer = (uint8_t*)malloc((size_t)rotate_buffer_size);
    if (!rotate_buffer) {
      return 1;  // Out of memory runtime error.
    }
@ -109,7 +102,7 @@ int ConvertToI420(const uint8_t* sample,
      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
                     stride_u, v, stride_v, crop_width, inv_crop_height);
      break;
@ -119,86 +112,84 @@ int ConvertToI420(const uint8_t* sample,
      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
-      src = sample + ((ptrdiff_t)aligned_src_width * crop_y + crop_x) * 2;
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
                     stride_u, v, stride_v, crop_width, inv_crop_height);
      break;
    }
    case FOURCC_RGBP:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
      r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                       dst_stride_u, dst_v, dst_stride_v, crop_width,
                       inv_crop_height);
      break;
    case FOURCC_RGBO:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                         dst_stride_u, dst_v, dst_stride_v, crop_width,
                         inv_crop_height);
      break;
    case FOURCC_R444:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 2;
+      src = sample + (src_width * crop_y + crop_x) * 2;
      r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
                         dst_stride_u, dst_v, dst_stride_v, crop_width,
                         inv_crop_height);
      break;
    case FOURCC_24BG:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
                      dst_stride_u, dst_v, dst_stride_v, crop_width,
                      inv_crop_height);
      break;
    case FOURCC_RAW:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 3;
+      src = sample + (src_width * crop_y + crop_x) * 3;
      r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
                    dst_stride_u, dst_v, dst_stride_v, crop_width,
                    inv_crop_height);
      break;
    case FOURCC_ARGB:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_BGRA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_ABGR:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    case FOURCC_RGBA:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x) * 4;
+      src = sample + (src_width * crop_y + crop_x) * 4;
      r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, crop_width,
                     inv_crop_height);
      break;
    // TODO(fbarchard): Add AR30 and AB30
    case FOURCC_I400:
-      src = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      src = sample + src_width * crop_y + crop_x;
      r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
                     dst_v, dst_stride_v, crop_width, inv_crop_height);
      break;
    // Biplanar formats
    case FOURCC_NV12:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample + ((ptrdiff_t)src_width * abs_src_height) +
-               ((ptrdiff_t)(crop_y / 2) * aligned_src_width) +
-               ((crop_x / 2) * 2);
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
                           dst_stride_y, dst_u, dst_stride_u, dst_v,
                           dst_stride_v, crop_width, inv_crop_height, rotation);
      break;
    case FOURCC_NV21:
-      src = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
-      src_uv = sample + ((ptrdiff_t)src_width * abs_src_height) +
-               ((ptrdiff_t)(crop_y / 2) * aligned_src_width) +
-               ((crop_x / 2) * 2);
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
      // Call NV12 but with dst_u and dst_v parameters swapped.
      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
                           dst_stride_y, dst_v, dst_stride_v, dst_u,
@ -207,23 +198,21 @@ int ConvertToI420(const uint8_t* sample,
    // Triplanar formats
    case FOURCC_I420:
    case FOURCC_YV12: {
-      const uint8_t* src_y = sample + ((ptrdiff_t)src_width * crop_y + crop_x);
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
      const uint8_t* src_u;
      const uint8_t* src_v;
      int halfwidth = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
      if (format == FOURCC_YV12) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * (crop_y / 2) + (crop_x / 2);
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + (crop_y / 2)) +
+        src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
                (crop_x / 2);
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
      } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * (crop_y / 2) + (crop_x / 2);
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)halfheight + (crop_y / 2)) +
+        src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
                (crop_x / 2);
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
      }
      r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
@ -232,20 +221,20 @@ int ConvertToI420(const uint8_t* sample,
    }
    case FOURCC_I422:
    case FOURCC_YV16: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* src_u;
      const uint8_t* src_v;
      int halfwidth = (src_width + 1) / 2;
      if (format == FOURCC_YV16) {
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + (crop_x / 2);
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + (crop_x / 2);
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
+        src_u = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
      } else {
-        src_u = sample + (ptrdiff_t)src_width * abs_src_height +
-                (ptrdiff_t)halfwidth * crop_y + (crop_x / 2);
-        src_v = sample + (ptrdiff_t)src_width * abs_src_height +
-                halfwidth * ((ptrdiff_t)abs_src_height + crop_y) + (crop_x / 2);
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                (crop_x / 2);
+        src_v = sample + src_width * abs_src_height +
+                halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
      }
      r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
@ -254,19 +243,15 @@ int ConvertToI420(const uint8_t* sample,
    }
    case FOURCC_I444:
    case FOURCC_YV24: {
-      const uint8_t* src_y = sample + (ptrdiff_t)src_width * crop_y + crop_x;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
      const uint8_t* src_u;
      const uint8_t* src_v;
      if (format == FOURCC_YV24) {
-        src_v =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_u = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      } else {
-        src_u =
-            sample + src_width * ((ptrdiff_t)abs_src_height + crop_y) + crop_x;
-        src_v = sample + src_width * ((ptrdiff_t)abs_src_height * 2 + crop_y) +
-                crop_x;
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
      }
      r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@ -397,6 +397,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
  int cpu_info7[4] = {0, 0, 0, 0};
  int cpu_einfo7[4] = {0, 0, 0, 0};
  int cpu_info24[4] = {0, 0, 0, 0};
+  int cpu_info21[4] = {0, 0, 0, 0};
  int cpu_amdinfo21[4] = {0, 0, 0, 0};
  CpuId(0, 0, cpu_info0);
  CpuId(1, 0, cpu_info1);
@ -405,6 +406,9 @@ static SAFEBUFFERS int GetCpuFlags(void) {
    CpuId(7, 1, cpu_einfo7);
    CpuId(0x80000021, 0, cpu_amdinfo21);
  }
+  if (cpu_info0[0] >= 0x21) {
+    CpuId(0x21, 0, cpu_info21);
+  }
  if (cpu_info0[0] >= 0x24) {
    CpuId(0x24, 0, cpu_info24);
  }
@ -435,7 +439,8 @@ static SAFEBUFFERS int GetCpuFlags(void) {
                  ((cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0) |
                  ((cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0) |
                  ((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
-                  ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
+                  ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0) |
+                  ((cpu_info21[0] & 0x00800000) ? kCpuHasAVX512BMM : 0);
      if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
        cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
      }
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
--- a/source/rotate.cc
+++ b/source/rotate.cc
@ -8,10 +8,9 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "libyuv/rotate.h"
-
 #include <assert.h>
-#include <limits.h>
+
+#include "libyuv/rotate.h"

 #include "libyuv/convert.h"
 #include "libyuv/cpu_id.h"
@ -129,7 +128,7 @@ void RotatePlane90(const uint8_t* src,
  // Rotate by 90 is a transpose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
-  src += (ptrdiff_t)src_stride * (height - 1);
+  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }
@ -144,7 +143,7 @@ void RotatePlane270(const uint8_t* src,
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
-  dst += (ptrdiff_t)dst_stride * (width - 1);
+  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }
@ -161,8 +160,8 @@ void RotatePlane180(const uint8_t* src,
  assert(row);
  if (!row)
    return;
-  const uint8_t* src_bot = src + (ptrdiff_t)src_stride * (height - 1);
-  uint8_t* dst_bot = dst + (ptrdiff_t)dst_stride * (height - 1);
+  const uint8_t* src_bot = src + src_stride * (height - 1);
+  uint8_t* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;
  int y;
  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
@ -355,7 +354,7 @@ void SplitRotateUV90(const uint8_t* src,
                     int dst_stride_b,
                     int width,
                     int height) {
-  src += (ptrdiff_t)src_stride * (height - 1);
+  src += src_stride * (height - 1);
  src_stride = -src_stride;

  SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
@ -398,14 +397,9 @@ void SplitRotateUV180(const uint8_t* src,
    MirrorSplitUVRow = MirrorSplitUVRow_NEON;
  }
 #endif
-#if defined(HAS_MIRRORSPLITUVROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    MirrorSplitUVRow = MirrorSplitUVRow_AVX2;
-  }
-#endif
-#if defined(HAS_MIRRORSPLITUVROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW) && IS_ALIGNED(width, 32)) {
-    MirrorSplitUVRow = MirrorSplitUVRow_AVX512BW;
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
  }
 #endif
 #if defined(HAS_MIRRORSPLITUVROW_LSX)
@ -437,15 +431,14 @@ int SplitRotateUV(const uint8_t* src_uv,
                  int width,
                  int height,
                  enum RotationMode mode) {
-  if (!src_uv || width <= 0 || height == 0 || height == INT_MIN || !dst_u ||
-      !dst_v) {
+  if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_uv = src_uv + (ptrdiff_t)(height - 1) * src_stride_uv;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }

@ -480,14 +473,14 @@ int RotatePlane(const uint8_t* src,
                int width,
                int height,
                enum RotationMode mode) {
-  if (!src || width <= 0 || height == 0 || height == INT_MIN || !dst) {
+  if (!src || width <= 0 || height == 0 || !dst) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src = src + (ptrdiff_t)(height - 1) * src_stride;
+    src = src + (height - 1) * src_stride;
    src_stride = -src_stride;
  }

@ -540,7 +533,7 @@ static void RotatePlane90_16(const uint16_t* src,
  // Rotate by 90 is a transpose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
-  src += (ptrdiff_t)src_stride * (height - 1);
+  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
 }
@ -554,7 +547,7 @@ static void RotatePlane270_16(const uint16_t* src,
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
-  dst += (ptrdiff_t)dst_stride * (width - 1);
+  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;
  TransposePlane_16(src, src_stride, dst, dst_stride, width, height);
 }
@ -565,8 +558,8 @@ static void RotatePlane180_16(const uint16_t* src,
                              int dst_stride,
                              int width,
                              int height) {
-  const uint16_t* src_bot = src + (ptrdiff_t)src_stride * (height - 1);
-  uint16_t* dst_bot = dst + (ptrdiff_t)dst_stride * (height - 1);
+  const uint16_t* src_bot = src + src_stride * (height - 1);
+  uint16_t* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;
  int y;

@ -598,14 +591,14 @@ int RotatePlane_16(const uint16_t* src,
                   int width,
                   int height,
                   enum RotationMode mode) {
-  if (!src || width <= 0 || height == 0 || height == INT_MIN || !dst) {
+  if (!src || width <= 0 || height == 0 || !dst) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src = src + (ptrdiff_t)(height - 1) * src_stride;
+    src = src + (height - 1) * src_stride;
    src_stride = -src_stride;
  }

@ -648,7 +641,7 @@ int I420Rotate(const uint8_t* src_y,
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+      !dst_y || !dst_u || !dst_v) {
    return -1;
  }

@ -656,9 +649,9 @@ int I420Rotate(const uint8_t* src_y,
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -718,16 +711,16 @@ int I422Rotate(const uint8_t* src_y,
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  int r;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -813,17 +806,17 @@ int I444Rotate(const uint8_t* src_y,
               int width,
               int height,
               enum RotationMode mode) {
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -873,8 +866,8 @@ int NV12ToI420Rotate(const uint8_t* src_y,
                     enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_uv || width <= 0 || height == 0 || height == INT_MIN ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+      !dst_v) {
    return -1;
  }

@ -882,8 +875,8 @@ int NV12ToI420Rotate(const uint8_t* src_y,
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_uv = src_uv + (ptrdiff_t)(halfheight - 1) * src_stride_uv;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }
@ -950,16 +943,16 @@ int Android420ToI420Rotate(const uint8_t* src_y,
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
-      height == 0 || height == INT_MIN) {
+      height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(halfheight - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(halfheight - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -1025,16 +1018,16 @@ int I010Rotate(const uint16_t* src_y,
               enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_stride_y < 0) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v || dst_stride_y < 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -1096,16 +1089,16 @@ int I210Rotate(const uint16_t* src_y,
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  int r;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
@ -1193,16 +1186,16 @@ int I410Rotate(const uint16_t* src_y,
               int width,
               int height,
               enum RotationMode mode) {
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_stride_y < 0) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v || dst_stride_y < 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_y = src_y + (ptrdiff_t)(height - 1) * src_stride_y;
-    src_u = src_u + (ptrdiff_t)(height - 1) * src_stride_u;
-    src_v = src_v + (ptrdiff_t)(height - 1) * src_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@ -10,8 +10,6 @@

 #include "libyuv/rotate_argb.h"

-#include <limits.h>
-
 #include "libyuv/convert.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
@ -224,15 +222,14 @@ int ARGBRotate(const uint8_t* src_argb,
               int width,
               int height,
               enum RotationMode mode) {
-  if (!src_argb || width <= 0 || height == 0 || height == INT_MIN ||
-      !dst_argb) {
+  if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_argb = src_argb + (ptrdiff_t)(height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }

--- a/source/rotate_common.cc
+++ b/source/rotate_common.cc
@ -191,10 +191,10 @@ void Transpose4x4_32_C(const uint8_t* src,
    ((uint32_t*)(dst3))[1] = p31;
    ((uint32_t*)(dst3))[2] = p32;
    ((uint32_t*)(dst3))[3] = p33;
-    src += (ptrdiff_t)src_stride * 4;  // advance 4 rows
-    src1 += (ptrdiff_t)src_stride * 4;
-    src2 += (ptrdiff_t)src_stride * 4;
-    src3 += (ptrdiff_t)src_stride * 4;
+    src += src_stride * 4;  // advance 4 rows
+    src1 += src_stride * 4;
+    src2 += src_stride * 4;
+    src3 += src_stride * 4;
    dst += 4 * 4;  // advance 4 columns
    dst1 += 4 * 4;
    dst2 += 4 * 4;
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@ -198,16 +198,16 @@ void Transpose4x4_32_NEON(const uint8_t* src,
      "vst1.8      {q3}, [%7]!                   \n"
      "bgt         1b                            \n"

-      : "+r"(src),                      // %0
-        "+r"(src1),                     // %1
-        "+r"(src2),                     // %2
-        "+r"(src3),                     // %3
-        "+r"(dst),                      // %4
-        "+r"(dst1),                     // %5
-        "+r"(dst2),                     // %6
-        "+r"(dst3),                     // %7
-        "+r"(width)                     // %8
-      : "r"((ptrdiff_t)src_stride * 4)  // %9
+      : "+r"(src),                        // %0
+        "+r"(src1),                       // %1
+        "+r"(src2),                       // %2
+        "+r"(src3),                       // %3
+        "+r"(dst),                        // %4
+        "+r"(dst1),                       // %5
+        "+r"(dst2),                       // %6
+        "+r"(dst3),                       // %7
+        "+r"(width)                       // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
      : "memory", "cc", "q0", "q1", "q2", "q3");
 }

--- a/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@ -252,16 +252,16 @@ void Transpose4x4_32_NEON(const uint8_t* src,
      "st1         {v2.4s}, [%6], 16             \n"
      "st1         {v3.4s}, [%7], 16             \n"
      "b.gt        1b                            \n"
-      : "+r"(src),                      // %0
-        "+r"(src1),                     // %1
-        "+r"(src2),                     // %2
-        "+r"(src3),                     // %3
-        "+r"(dst),                      // %4
-        "+r"(dst1),                     // %5
-        "+r"(dst2),                     // %6
-        "+r"(dst3),                     // %7
-        "+r"(width)                     // %8
-      : "r"((ptrdiff_t)src_stride * 4)  // %9
+      : "+r"(src),                        // %0
+        "+r"(src1),                       // %1
+        "+r"(src2),                       // %2
+        "+r"(src3),                       // %3
+        "+r"(dst),                        // %4
+        "+r"(dst1),                       // %5
+        "+r"(dst2),                       // %6
+        "+r"(dst3),                       // %7
+        "+r"(width)                       // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
      : "memory", "cc", "v0", "v1", "v2", "v3");
 }

--- a/source/rotate_win.cc
+++ b/source/rotate_win.cc
@ -64,7 +64,7 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
-     // Second round of bit swap.
+    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
@ -77,8 +77,8 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
-     // Third round of bit swap.
-     // Write to the destination pointer.
+    // Third round of bit swap.
+    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
@ -173,7 +173,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
-         // Second round of bit swap.
+        // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
@ -193,8 +193,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6

-         // Third round of bit swap.
-         // Write to the destination pointer.
+        // Third round of bit swap.
+        // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -10,6 +10,7 @@

 #include "libyuv/row.h"

+#include <stddef.h>
 #include <string.h>  // For memset.

 #include "libyuv/basic_types.h"
@ -387,12 +388,6 @@ ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15)
 #ifdef HAS_I422TORGB24ROW_AVX2
 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
 #endif
-#ifdef HAS_I422TORGB24ROW_AVX512VBMI
-ANY31C(I422ToRGB24Row_Any_AVX512VBMI, I422ToRGB24Row_AVX512VBMI, 1, 0, 3, 31)
-#endif
-#ifdef HAS_I422TORGB24ROW_AVX512BW
-ANY31C(I422ToRGB24Row_Any_AVX512BW, I422ToRGB24Row_AVX512BW, 1, 0, 3, 31)
-#endif
 #ifdef HAS_I422TOARGBROW_AVX2
 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
 #endif
@ -951,7 +946,9 @@ ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
 #if defined(HAS_ARGBTORGB24ROW_SSSE3)
 ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
 ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
-
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
 #endif
 #if defined(HAS_ARGBTORGB24ROW_AVX2)
 ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
@ -987,9 +984,8 @@ ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
 #if defined(HAS_ARGBTOAR30ROW_AVX2)
 ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
 #endif
-
-#if defined(HAS_J400TOARGBROW_AVX512BW)
-ANY11(J400ToARGBRow_Any_AVX512BW, J400ToARGBRow_AVX512BW, 0, 1, 4, 31)
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
 #endif
 #if defined(HAS_J400TOARGBROW_AVX2)
 ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
@ -997,14 +993,13 @@ ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
 ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
 ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
-
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
 #endif
 #if defined(HAS_RAWTOARGBROW_AVX2)
 ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31)
 #endif
-#if defined(HAS_RGB24TOARGBROW_AVX2)
-ANY11(RGB24ToARGBRow_Any_AVX2, RGB24ToARGBRow_AVX2, 0, 3, 4, 31)
-#endif
 #if defined(HAS_RAWTOARGBROW_AVX512BW)
 ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63)
 #endif
@ -1420,8 +1415,8 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
 // Any 1 to 1 with parameter.
 #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
-    SIMD_ALIGNED(uint8_t vin[(MASK + 1) * SBPP]);                              \
-    SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]);                              \
+    SIMD_ALIGNED(uint8_t vin[64]);                                             \
+    SIMD_ALIGNED(uint8_t vout[64]);                                            \
    memset(vin, 0, sizeof(vin)); /* for msan */                                \
    int r = width & MASK;                                                      \
    int n = width & ~MASK;                                                     \
@ -1467,6 +1462,14 @@ ANY11P(I400ToARGBRow_Any_LSX,
       15)
 #endif

+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+       ARGBToRGB565DitherRow_SSE2,
+       const uint32_t,
+       4,
+       2,
+       3)
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
 ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
       ARGBToRGB565DitherRow_AVX2,
@ -1505,14 +1508,6 @@ ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
 ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
 #endif
-#ifdef HAS_ARGBSHUFFLEROW_AVX512BW
-ANY11P(ARGBShuffleRow_Any_AVX512BW,
-       ARGBShuffleRow_AVX512BW,
-       const uint8_t*,
-       4,
-       4,
-       31)
-#endif
 #ifdef HAS_ARGBSHUFFLEROW_NEON
 ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
 #endif
@ -1835,9 +1830,18 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
    memcpy(dst_ptr + np * BPP, vout, r * BPP * sizeof(TD));          \
  }

-#if defined(HAS_INTERPOLATEROW_AVX2)
+#ifdef HAS_INTERPOLATEROW_AVX2
 ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
 #endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11I(InterpolateRow_Any_SSSE3,
+       InterpolateRow_SSSE3,
+       uint8_t,
+       uint8_t,
+       1,
+       1,
+       15)
+#endif
 #ifdef HAS_INTERPOLATEROW_NEON
 ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
 #endif
@ -1854,15 +1858,6 @@ ANY11I(InterpolateRow_16_Any_NEON,
       1,
       7)
 #endif
-#ifdef HAS_INTERPOLATEROW_16_AVX2
-ANY11I(InterpolateRow_16_Any_AVX2,
-       InterpolateRow_16_AVX2,
-       uint16_t,
-       uint16_t,
-       1,
-       1,
-       15)
-#endif
 #undef ANY11I

 // Any 1 to 1 interpolate with scale param
@ -1911,8 +1906,8 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
 // Any 1 to 1 mirror.
 #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                          \
  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
-    SIMD_ALIGNED(uint8_t vin[128]);                                   \
-    SIMD_ALIGNED(uint8_t vout[128]);                                  \
+    SIMD_ALIGNED(uint8_t vin[64]);                                    \
+    SIMD_ALIGNED(uint8_t vout[64]);                                   \
    memset(vin, 0, sizeof(vin)); /* for msan */                       \
    int r = width & MASK;                                             \
    int n = width & ~MASK;                                            \
@ -1920,14 +1915,11 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
      ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                        \
    }                                                                 \
    ptrdiff_t np = n;                                                 \
-    memcpy(vin, src_ptr, r * BPP);                                    \
+    memcpy(vin, src_ptr, r* BPP);                                     \
    ANY_SIMD(vin, vout, MASK + 1);                                    \
    memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
  }

-#ifdef HAS_MIRRORROW_AVX512BW
-ANY11M(MirrorRow_Any_AVX512BW, MirrorRow_AVX512BW, 1, 63)
-#endif
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
 #endif
@ -1946,6 +1938,9 @@ ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63)
 #ifdef HAS_MIRRORUVROW_AVX2
 ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
 #endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
 #ifdef HAS_MIRRORUVROW_NEON
 ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
 #endif
@ -1970,8 +1965,8 @@ ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7)
 #ifdef HAS_ARGBMIRRORROW_LASX
 ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
 #endif
-#ifdef HAS_RGB24MIRRORROW_AVX2
-ANY11M(RGB24MirrorRow_Any_AVX2, RGB24MirrorRow_AVX2, 3, 31)
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
 #endif
 #ifdef HAS_RGB24MIRRORROW_NEON
 ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
@ -2031,9 +2026,6 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3)
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
 #endif
-#ifdef HAS_SPLITUVROW_AVX512BW
-ANY12(SplitUVRow_Any_AVX512BW, SplitUVRow_AVX512BW, 0, 2, 0, 63)
-#endif
 #ifdef HAS_SPLITUVROW_AVX2
 ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
 #endif
@ -2205,7 +2197,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
               uint8_t* dst_v, int width) {                                  \
    SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
    SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
-    memset(vin, 0, sizeof(vin));   /* for msan */                            \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
    memset(vout, 0, sizeof(vout)); /* for msan */                            \
    int r = width & MASK;                                                    \
    int n = width & ~MASK;                                                   \
@ -2227,29 +2219,29 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
    memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1));                         \
  }

-#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK)                           \
-  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
-               int width, const struct ArgbConstants* c) {             \
-    SIMD_ALIGNED(uint8_t vin[256]);                                    \
-    SIMD_ALIGNED(uint8_t vout[256 * 2]);                               \
-    memset(vin, 0, sizeof(vin)); /* for msan */                        \
-    int r = width & MASK;                                              \
-    int n = width & ~MASK;                                             \
-    if (n > 0) {                                                       \
-      ANY_SIMD(src_ptr, dst_u, dst_v, n, c);                           \
-    }                                                                  \
-    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);     \
-    ANY_SIMD(vin, vout, vout + 256, MASK + 1, c);                      \
-    memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r);                  \
-    memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r);            \
+#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK)                                 \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v,        \
+               int width, const struct ArgbConstants* c) {                   \
+    SIMD_ALIGNED(uint8_t vin[256]);                                          \
+    SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n, c);                                 \
+    }                                                                        \
+    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);           \
+    ANY_SIMD(vin, vout, vout + 256, MASK + 1, c);                            \
+    memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r);                        \
+    memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r);                  \
  }

 #define ANY12MS(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                       \
-  void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u,       \
-               uint8_t* dst_v, int width, const struct ArgbConstants* c) {   \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u,        \
+               uint8_t* dst_v, int width, const struct ArgbConstants* c) {    \
    SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
    SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
-    memset(vin, 0, sizeof(vin));   /* for msan */                            \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
    memset(vout, 0, sizeof(vout)); /* for msan */                            \
    int r = width & MASK;                                                    \
    int n = width & ~MASK;                                                   \
@ -2277,35 +2269,12 @@ ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
 #ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
 ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
 #endif
-#ifdef HAS_RGBTOUVMATRIXROW_NEON
-ANY12MS(RGBToUVMatrixRow_Any_NEON, RGBToUVMatrixRow_NEON, 0, 3, 15)
-#endif
-#ifdef HAS_RGB565TOUVMATRIXROW_NEON
-ANY12MS(RGB565ToUVMatrixRow_Any_NEON, RGB565ToUVMatrixRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOUVMATRIXROW_NEON
-ANY12MS(ARGB1555ToUVMatrixRow_Any_NEON, ARGB1555ToUVMatrixRow_NEON, 0, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOUVMATRIXROW_NEON
-ANY12MS(ARGB4444ToUVMatrixRow_Any_NEON, ARGB4444ToUVMatrixRow_NEON, 0, 2, 15)
-#endif
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX2
-ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 31)
-ANY12MS(RGBToUVMatrixRow_Any_AVX2, RGBToUVMatrixRow_AVX2, 0, 3, 31)
-ANY12MS(RGB565ToUVMatrixRow_Any_AVX2, RGB565ToUVMatrixRow_AVX2, 0, 2, 31)
-#ifdef HAS_ARGB1555TOARGBROW_AVX2
-ANY12MS(ARGB1555ToUVMatrixRow_Any_AVX2, ARGB1555ToUVMatrixRow_AVX2, 0, 2, 31)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_AVX2
-ANY12MS(ARGB4444ToUVMatrixRow_Any_AVX2, ARGB4444ToUVMatrixRow_AVX2, 0, 2, 31)
-#endif
+ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
 #endif
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW
 ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63)
 #endif
-#ifdef HAS_RGBTOUVMATRIXROW_AVX512BW
-ANY12MS(RGBToUVMatrixRow_Any_AVX512BW, RGBToUVMatrixRow_AVX512BW, 0, 3, 63)
-#endif
 #ifdef HAS_ARGBTOUVMATRIXROW_SSSE3
 ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7)
 #endif
@ -2322,20 +2291,20 @@ ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15)
 ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
 #endif

-#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK)                       \
-  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width, \
-               const struct ArgbConstants* c) {                     \
-    SIMD_ALIGNED(uint8_t vin[256]);                                 \
-    SIMD_ALIGNED(uint8_t vout[256]);                                \
-    memset(vin, 0, sizeof(vin)); /* for msan */                     \
-    int r = width & MASK;                                           \
-    int n = width & ~MASK;                                          \
-    if (n > 0) {                                                    \
-      ANY_SIMD(src_ptr, dst_ptr, n, c);                             \
-    }                                                               \
-    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);  \
-    ANY_SIMD(vin, vout, MASK + 1, c);                               \
-    memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r);             \
+#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK)                                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width,          \
+               const struct ArgbConstants* c) {                              \
+    SIMD_ALIGNED(uint8_t vin[256]);                                          \
+    SIMD_ALIGNED(uint8_t vout[256]);                                         \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_ptr, n, c);                                      \
+    }                                                                        \
+    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);           \
+    ANY_SIMD(vin, vout, MASK + 1, c);                                        \
+    memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r);                      \
  }

 #ifdef HAS_ARGBTOYROW_SSSE3
@ -2343,14 +2312,6 @@ ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15)
 #endif
 #ifdef HAS_ARGBTOYROW_AVX2
 ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31)
-ANY11MC(RGBToYMatrixRow_Any_AVX2, RGBToYMatrixRow_AVX2, 3, 31)
-ANY11MC(RGB565ToYMatrixRow_Any_AVX2, RGB565ToYMatrixRow_AVX2, 2, 31)
-#ifdef HAS_ARGB1555TOYMATRIXROW_AVX2
-ANY11MC(ARGB1555ToYMatrixRow_Any_AVX2, ARGB1555ToYMatrixRow_AVX2, 2, 31)
-#endif
-#ifdef HAS_ARGB4444TOYMATRIXROW_AVX2
-ANY11MC(ARGB4444ToYMatrixRow_Any_AVX2, ARGB4444ToYMatrixRow_AVX2, 2, 31)
-#endif
 #endif
 #ifdef HAS_ARGBTOYROW_AVX512BW
 ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63)
@ -2361,18 +2322,6 @@ ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
 #ifdef HAS_ARGBTOYMATRIXROW_NEON_DOTPROD
 ANY11MC(ARGBToYMatrixRow_Any_NEON_DotProd, ARGBToYMatrixRow_NEON_DotProd, 4, 15)
 #endif
-#ifdef HAS_RGBTOYMATRIXROW_NEON
-ANY11MC(RGBToYMatrixRow_Any_NEON, RGBToYMatrixRow_NEON, 3, 15)
-#endif
-#ifdef HAS_RGB565TOYMATRIXROW_NEON
-ANY11MC(RGB565ToYMatrixRow_Any_NEON, RGB565ToYMatrixRow_NEON, 2, 15)
-#endif
-#ifdef HAS_ARGB1555TOYMATRIXROW_NEON
-ANY11MC(ARGB1555ToYMatrixRow_Any_NEON, ARGB1555ToYMatrixRow_NEON, 2, 15)
-#endif
-#ifdef HAS_ARGB4444TOYMATRIXROW_NEON
-ANY11MC(ARGB4444ToYMatrixRow_Any_NEON, ARGB4444ToYMatrixRow_NEON, 2, 15)
-#endif
 #ifdef HAS_ARGBTOYMATRIXROW_LSX
 ANY11MC(ARGBToYMatrixRow_Any_LSX, ARGBToYMatrixRow_LSX, 4, 15)
 #endif
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -14,7 +14,7 @@
 #include <string.h>  // For memcpy and memset.

 #include "libyuv/basic_types.h"
-#include "libyuv/convert_argb.h"       // For kYuvI601Constants
+#include "libyuv/convert_argb.h"  // For kYuvI601Constants
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants

 #ifdef __cplusplus
@ -37,6 +37,10 @@ extern "C" {
 // LIBYUV_UNLIMITED_BT709
 // LIBYUV_UNLIMITED_BT2020

+#if defined(LIBYUV_BIT_EXACT)
+#define LIBYUV_UNATTENUATE_DUP 1
+#endif
+
 // llvm x86 is poor at ternary operator, so use branchless min/max.

 #define USE_BRANCHLESS 1
@ -749,31 +753,28 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
 MAKEROWYJ(RGBA, 3, 2, 1, 4)
 #undef MAKEROWYJ

-static __inline uint8_t RGBToYMatrix(uint8_t b0,
-                                     uint8_t b1,
-                                     uint8_t b2,
-                                     uint8_t b3,
+static __inline uint8_t RGBToYMatrix(uint8_t r,
+                                     uint8_t g,
+                                     uint8_t b,
                                     const struct ArgbConstants* c) {
-  return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
-          c->kRGBToY[3] * b3 + c->kAddY[0]) >>
+  return (c->kRGBToY[2] * r + c->kRGBToY[1] * g + c->kRGBToY[0] * b +
+          c->kAddY[0]) >>
         8;
 }
-static __inline uint8_t RGBToUMatrix(uint8_t b0,
-                                     uint8_t b1,
-                                     uint8_t b2,
-                                     uint8_t b3,
+static __inline uint8_t RGBToUMatrix(uint8_t r,
+                                     uint8_t g,
+                                     uint8_t b,
                                     const struct ArgbConstants* c) {
-  return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
-                          c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
+  return (c->kAddUV[0] -
+          (c->kRGBToU[2] * r + c->kRGBToU[1] * g + c->kRGBToU[0] * b)) >>
         8;
 }
-static __inline uint8_t RGBToVMatrix(uint8_t b0,
-                                     uint8_t b1,
-                                     uint8_t b2,
-                                     uint8_t b3,
+static __inline uint8_t RGBToVMatrix(uint8_t r,
+                                     uint8_t g,
+                                     uint8_t b,
                                     const struct ArgbConstants* c) {
-  return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
-                          c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
+  return (c->kAddUV[0] -
+          (c->kRGBToV[2] * r + c->kRGBToV[1] * g + c->kRGBToV[0] * b)) >>
         8;
 }

@ -783,8 +784,7 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
                        const struct ArgbConstants* c) {
  int x;
  for (x = 0; x < width; ++x) {
-    dst_y[0] =
-        RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
+    dst_y[0] = RGBToYMatrix(src_argb[2], src_argb[1], src_argb[0], c);
    src_argb += 4;
    dst_y += 1;
  }
@ -799,28 +799,25 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
  const uint8_t* src_argb1 = src_argb + src_stride_argb;
  int x;
  for (x = 0; x < width - 1; x += 2) {
-    uint8_t b0 =
+    uint8_t ab =
        (src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
-    uint8_t b1 =
+    uint8_t ag =
        (src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
-    uint8_t b2 =
+    uint8_t ar =
        (src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
-    uint8_t b3 =
-        (src_argb[3] + src_argb[7] + src_argb1[3] + src_argb1[7] + 2) >> 2;
-    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
-    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
+    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
    src_argb += 8;
    src_argb1 += 8;
    dst_u += 1;
    dst_v += 1;
  }
  if (width & 1) {
-    uint8_t b0 = (src_argb[0] + src_argb1[0] + 1) >> 1;
-    uint8_t b1 = (src_argb[1] + src_argb1[1] + 1) >> 1;
-    uint8_t b2 = (src_argb[2] + src_argb1[2] + 1) >> 1;
-    uint8_t b3 = (src_argb[3] + src_argb1[3] + 1) >> 1;
-    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
-    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
+    uint8_t ab = (src_argb[0] + src_argb1[0] + 1) >> 1;
+    uint8_t ag = (src_argb[1] + src_argb1[1] + 1) >> 1;
+    uint8_t ar = (src_argb[2] + src_argb1[2] + 1) >> 1;
+    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
  }
 }

@ -831,10 +828,11 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
                            const struct ArgbConstants* c) {
  int x;
  for (x = 0; x < width; ++x) {
-    dst_u[0] =
-        RGBToUMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
-    dst_v[0] =
-        RGBToVMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
+    uint8_t ab = src_argb[0];
+    uint8_t ag = src_argb[1];
+    uint8_t ar = src_argb[2];
+    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
    src_argb += 4;
    dst_u += 1;
    dst_v += 1;
@ -1514,18 +1512,18 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);

-#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \
-  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =   \
-      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \
-                        -(RV), 0, AY, AUV);                                  \
-  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =   \
-      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \
-                        -(BV), 0, AY, AUV);                                  \
-  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =   \
-      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),     \
-                        -(GV), -(RV), AY, AUV);                              \
-  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =   \
-      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),     \
+#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV)   \
+  const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =            \
+      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV),   \
+                        -(RV), 0, AY, AUV);                                    \
+  const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =            \
+      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV),   \
+                        -(BV), 0, AY, AUV);                                    \
+  const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =            \
+      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),       \
+                        -(GV), -(RV), AY, AUV);                                \
+  const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =            \
+      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),       \
                        -(GV), -(BV), AY, AUV);

 // BT.601 limited range RGB to YUV coefficients
@ -3468,7 +3466,7 @@ void ARGBBlendRow_C(const uint8_t* src_argb,
 }
 #undef BLEND

-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
 void BlendPlaneRow_C(const uint8_t* src0,
                     const uint8_t* src1,
                     const uint8_t* alpha,
@ -3575,8 +3573,12 @@ const uint32_t fixed_invtbl8[256] = {
    T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};
 #undef T

+#if defined(LIBYUV_UNATTENUATE_DUP)
 // This code mimics the Intel SIMD version for better testability.
 #define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
+#else
+#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
+#endif

 // mimics the Intel SIMD code for exactness.
 void ARGBUnattenuateRow_C(const uint8_t* src_argb,
@ -3664,8 +3666,7 @@ void ARGBAffineRow_C(const uint8_t* src_argb,
    int x = (int)(uv[0]);
    int y = (int)(uv[1]);
    *(uint32_t*)(dst_argb) =
-        *(const uint32_t*)(src_argb + (ptrdiff_t)y * src_argb_stride +
-                           (ptrdiff_t)x * 4);
+        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    uv[0] += uv_dudv[2];
    uv[1] += uv_dudv[3];
@ -4171,7 +4172,7 @@ void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_NV12TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_NV12TORGB24ROW_AVX2)
 void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
@ -4182,7 +4183,11 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
    src_y += twidth;
    src_uv += twidth;
    dst_rgb24 += twidth * 3;
@ -4191,7 +4196,7 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_NV21TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_NV21TORGB24ROW_AVX2)
 void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
@ -4202,7 +4207,11 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
    src_y += twidth;
    src_vu += twidth;
    dst_rgb24 += twidth * 3;
@ -4211,7 +4220,7 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB565ROW_AVX2)
+#if defined(HAS_I422TORGB565ROW_AVX2)
 void I422ToRGB565Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
@ -4222,7 +4231,11 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
    src_y += twidth;
    src_u += twidth / 2;
    src_v += twidth / 2;
@ -4232,7 +4245,7 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTOARGB1555ROW_AVX2)
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
 void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
@ -4244,7 +4257,11 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
    ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+#else
+    ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+#endif
    src_y += twidth;
    src_u += twidth / 2;
    src_v += twidth / 2;
@ -4254,7 +4271,7 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTOARGB4444ROW_AVX2)
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
 void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
@ -4266,7 +4283,11 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
    ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+#else
+    ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+#endif
    src_y += twidth;
    src_u += twidth / 2;
    src_v += twidth / 2;
@ -4276,7 +4297,7 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_I422TORGB24ROW_AVX2)
 void I422ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@ -4288,7 +4309,11 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
    src_y += twidth;
    src_u += twidth / 2;
    src_v += twidth / 2;
@ -4298,51 +4323,7 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
-void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y,
-                               const uint8_t* src_u,
-                               const uint8_t* src_v,
-                               uint8_t* dst_rgb24,
-                               const struct YuvConstants* yuvconstants,
-                               int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth);
-    ARGBToRGB24Row_AVX512VBMI(row, dst_rgb24, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_rgb24 += twidth * 3;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX2)
-void I422ToRGB24Row_AVX512BW(const uint8_t* src_y,
-                             const uint8_t* src_u,
-                             const uint8_t* src_v,
-                             uint8_t* dst_rgb24,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
-  // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth);
-    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
-    src_y += twidth;
-    src_u += twidth / 2;
-    src_v += twidth / 2;
-    dst_rgb24 += twidth * 3;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_I444TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
+#if defined(HAS_I444TORGB24ROW_AVX2)
 void I444ToRGB24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@ -4354,7 +4335,11 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
    src_y += twidth;
    src_u += twidth;
    src_v += twidth;
@ -4364,7 +4349,7 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif

-#if defined(HAS_NV12TOARGBROW_AVX2) && defined(HAS_ARGBTORGB565ROW_AVX2)
+#if defined(HAS_NV12TORGB565ROW_AVX2)
 void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
@ -4375,7 +4360,11 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
    ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+#else
+    ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+#endif
    src_y += twidth;
    src_uv += twidth;
    dst_rgb565 += twidth * 2;
@ -4384,6 +4373,26 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
 }
 #endif

+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+}
+#endif  // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 32 RAW pixels (96 bytes) to 32 YJ values.
+}
+#endif  // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+}
+#endif  // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+}
+#endif  // HAS_RAWTOYJROW_SSSE3
+
 #ifdef HAS_INTERPOLATEROW_16TO8_AVX2
 void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
                               const uint16_t* src_ptr,
@ -4395,7 +4404,7 @@ void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
  SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    InterpolateRow_16_AVX2(row, src_ptr, src_stride, twidth, source_y_fraction);
+    InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
    Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
    src_ptr += twidth;
    dst_ptr += twidth;
@ -4601,465 +4610,6 @@ void HalfMergeUVRow_C(const uint8_t* src_u,

 #undef STATIC_CAST

-void RGBToYMatrixRow_C(const uint8_t* src_rgb,
-                       uint8_t* dst_y,
-                       int width,
-                       const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_C(src_rgb, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_rgb += twidth * 3;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void RGBToUVMatrixRow_C(const uint8_t* src_rgb,
-                        int src_stride_rgb,
-                        uint8_t* dst_u,
-                        uint8_t* dst_v,
-                        int width,
-                        const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_C(src_rgb, row, twidth);
-    RGB24ToARGBRow_C(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2)
-void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_AVX2(src_rgb, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_rgb += twidth * 3;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2)
-void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
-                           int src_stride_rgb,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_AVX2(src_rgb, row, twidth);
-    RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) && \
-    defined(HAS_RGB24TOARGBROW_AVX512BW)
-void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb,
-                               int src_stride_rgb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_AVX512BW(src_rgb, row, twidth);
-    RGB24ToARGBRow_AVX512BW(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4,
-                            twidth);
-    ARGBToUVMatrixRow_AVX512BW(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_RGB24TOARGBROW_NEON)
-void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb,
-                           int src_stride_rgb,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB24ToARGBRow_NEON(src_rgb, row, twidth);
-    RGB24ToARGBRow_NEON(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb += twidth * 3;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_C(src_rgb565, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
-                           int src_stride_rgb565,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width,
-                           const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_C(src_rgb565, row, twidth);
-    RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
-                      twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2)
-void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565,
-                             uint8_t* dst_y,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_AVX2(src_rgb565, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2)
-void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565,
-                              int src_stride_rgb565,
-                              uint8_t* dst_u,
-                              uint8_t* dst_v,
-                              int width,
-                              const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_AVX2(src_rgb565, row, twidth);
-    RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
-                         twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOYMATRIXROW_NEON)
-void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565,
-                             uint8_t* dst_y,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_NEON(src_rgb565, row, twidth);
-    ARGBToYMatrixRow_NEON(row, dst_y, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOUVMATRIXROW_NEON)
-void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565,
-                              int src_stride_rgb565,
-                              uint8_t* dst_u,
-                              uint8_t* dst_v,
-                              int width,
-                              const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    RGB565ToARGBRow_NEON(src_rgb565, row, twidth);
-    RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
-                         twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_rgb565 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_C(src_argb1555, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
-                             int src_stride_argb1555,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_C(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4,
-                        twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_C(src_argb4444, row, twidth);
-    ARGBToYMatrixRow_C(row, dst_y, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-
-void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
-                             int src_stride_argb4444,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width,
-                             const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_C(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4,
-                        twidth);
-    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_AVX2(src_argb1555, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_AVX2(src_argb4444, row, twidth);
-    ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
-#if defined(HAS_ARGB1555TOARGBROW_AVX2)
-void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_AVX2(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_AVX2(src_argb1555 + src_stride_argb1555,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGB4444TOARGBROW_AVX2)
-void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_AVX2(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_AVX2(src_argb4444 + src_stride_argb4444,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-#endif
-
-#if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON)
-void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth);
-    ARGBToYMatrixRow_NEON(row, dst_y, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON)
-void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444,
-                               uint8_t* dst_y,
-                               int width,
-                               const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth);
-    ARGBToYMatrixRow_NEON(row, dst_y, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_y += twidth;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON)
-void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb1555 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
-#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON)
-void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
-                                uint8_t* dst_u,
-                                uint8_t* dst_v,
-                                int width,
-                                const struct ArgbConstants* c) {
-  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
-  while (width > 0) {
-    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
-    ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444,
-                           row + MAXTWIDTH * 4, twidth);
-    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
-    src_argb4444 += twidth * 2;
-    dst_u += twidth / 2;
-    dst_v += twidth / 2;
-    width -= twidth;
-  }
-}
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
--- a/source/row_lasx.cc
+++ b/source/row_lasx.cc
@ -2027,12 +2027,10 @@ struct ArgbConstants {
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                         128,
-                                                         0};
+                                                        128,
+                                                        0};

-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
-                                                       128,
-                                                       0};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};

 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@ -2041,19 +2039,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
 // Add 16.5 = 0x1080

 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                         0x1080,
-                                                         0};
+                                                        0x1080,
+                                                        0};

 static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                       0x1080,
-                                                       0};
+                                                      0x1080,
+                                                      0};
 #endif  // ArgbConstants

 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
-                           uint8_t* dst_y,
-                           int width,
-                           const struct ArgbConstants* c) {
+                                  uint8_t* dst_y,
+                                  int width,
+                                  const struct ArgbConstants* c) {
  int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
  asm volatile(
      "xvldrepl.b      $xr0,  %3,    0             \n\t"  // load rgbconstants
@ -2218,14 +2216,18 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
      "xvst            $xr10, %1,    0             \n\t"
      "addi.d          %1,    %1,    32            \n\t"
      "bnez            %2,    1b                   \n\t"
-      : "+&r"(src_rgba),  // %0
-        "+&r"(dst_y),     // %1
-        "+&r"(width)      // %2
-      : "r"(c),           // %3
-        "r"(shuff)        // %4
+      : "+&r"(src_rgba),    // %0
+        "+&r"(dst_y),       // %1
+        "+&r"(width)        // %2
+      : "r"(c),  // %3
+        "r"(shuff)          // %4
      : "memory");
 }

+
+
+
+
 void ARGBToUVJRow_LASX(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
--- a/source/row_lsx.cc
+++ b/source/row_lsx.cc
@ -2812,12 +2812,10 @@ struct ArgbConstants {
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                         128,
-                                                         0};
+                                                        128,
+                                                        0};

-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
-                                                       128,
-                                                       0};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};

 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@ -2826,19 +2824,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
 // Add 16.5 = 0x1080

 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                         0x1080,
-                                                         0};
+                                                        0x1080,
+                                                        0};

 static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                       0x1080,
-                                                       0};
+                                                      0x1080,
+                                                      0};
 #endif  // ArgbConstants

 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c) {
  asm volatile(
      "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
      "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
@ -2989,14 +2987,18 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
      "vst            $vr10, %1,    0             \n\t"
      "addi.d         %1,    %1,    16            \n\t"
      "bnez           %2,    1b                   \n\t"
-      : "+&r"(src_rgba),  // %0
-        "+&r"(dst_y),     // %1
-        "+&r"(width)      // %2
-      : "r"(c),           // %3
-        "r"(shuff)        // %4
+      : "+&r"(src_rgba),    // %0
+        "+&r"(dst_y),       // %1
+        "+&r"(width)        // %2
+      : "r"(c),  // %3
+        "r"(shuff)          // %4
      : "memory");
 }

+
+
+
+
 // undef for unified sources build
 #undef YUVTORGB_SETUP
 #undef READYUV422_D
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -8,8 +8,8 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"  // For ArgbConstants

 #ifdef __cplusplus
 namespace libyuv {
@ -272,7 +272,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
      "subs        %[width], %[width], #8        \n"  //
      YUVTORGB                                        //
          RGBTORGB8                                   //
-      STORERGBA                                       //
+              STORERGBA                               //
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
@ -325,8 +325,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n"  //
-      READYUV422 "subs        %[width], %[width], #8        \n" YUVTORGB
-          RGBTORGB8 ARGBTORGB565
+      READYUV422
+      "subs        %[width], %[width], #8        \n" YUVTORGB RGBTORGB8
+          ARGBTORGB565
      "vst1.8      {q2}, [%[dst_rgb565]]!        \n"  // store 8 pixels RGB565.
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
@ -1847,54 +1848,45 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                               int width,
                               const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d24}, [%4]                   \n"  // load kRGBToU
-      "vld1.8      {d25}, [%5]                   \n"  // load kRGBToV
-      "vld1.16     {d26[0]}, [%6]                \n"  // load kAddUV[0]
-      "vmovl.s8    q10, d24                      \n"  // U coeffs (8 shorts)
-      "vmovl.s8    q11, d25                      \n"  // V coeffs (8 shorts)
-      "vdup.16     q6, d26[0]                    \n"  // bias
+      "vld1.8      {d16}, [%4]                   \n"  // load kRGBToU
+      "vld1.8      {d17}, [%5]                   \n"  // load kRGBToV
+      "vld1.16     {d18[0]}, [%6]                \n"  // load kAddUV[0]
+      "vabs.s8     d16, d16                      \n"  // BU, GU, RU
+      "vabs.s8     d17, d17                      \n"  // BV, GV, RV
+      "vdup.8      d20, d16[0]                   \n"  // BU
+      "vdup.8      d21, d16[1]                   \n"  // GU
+      "vdup.8      d22, d16[2]                   \n"  // RU
+      "vdup.8      d23, d17[0]                   \n"  // BV
+      "vdup.8      d24, d17[1]                   \n"  // GV
+      "vdup.8      d25, d17[2]                   \n"  // RV
+      "vdup.16     q15, d18[0]                   \n"  // kAddUV
+
      "1:          \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vmull.u8    q2, d0, d20                   \n"  // B * BU
+      "vmlsl.u8    q2, d1, d21                   \n"  // - G * GU
+      "vmlsl.u8    q2, d2, d22                   \n"  // - R * RU

-      "vmovl.u8    q4, d0                        \n"  // B
-      "vmovl.u8    q5, d1                        \n"  // G
-      "vmovl.u8    q7, d2                        \n"  // R
-      "vmovl.u8    q8, d3                        \n"  // A
+      "vmull.u8    q3, d2, d25                   \n"  // R * RV
+      "vmlsl.u8    q3, d1, d24                   \n"  // - G * GV
+      "vmlsl.u8    q3, d0, d23                   \n"  // - B * BV

-      "vdup.16     q12, d20[0]                   \n"
-      "vmul.s16    q2, q4, q12                   \n"  // U = B * U0
-      "vdup.16     q12, d20[1]                   \n"
-      "vmla.s16    q2, q5, q12                   \n"  // U += G * U1
-      "vdup.16     q12, d20[2]                   \n"
-      "vmla.s16    q2, q7, q12                   \n"  // U += R * U2
-      "vdup.16     q12, d20[3]                   \n"
-      "vmla.s16    q2, q8, q12                   \n"  // U += A * U3
-
-      "vdup.16     q12, d22[0]                   \n"
-      "vmul.s16    q3, q4, q12                   \n"  // V = B * V0
-      "vdup.16     q12, d22[1]                   \n"
-      "vmla.s16    q3, q5, q12                   \n"  // V += G * V1
-      "vdup.16     q12, d22[2]                   \n"
-      "vmla.s16    q3, q7, q12                   \n"  // V += R * V2
-      "vdup.16     q12, d22[3]                   \n"
-      "vmla.s16    q3, q8, q12                   \n"  // V += A * V3
-
-      "vsubhn.s16  d0, q6, q2                    \n"  // 128.0 - U
-      "vsubhn.s16  d1, q6, q3                    \n"  // 128.0 - V
+      "vaddhn.u16  d0, q2, q15                   \n"  // signed -> unsigned
+      "vaddhn.u16  d1, q3, q15                   \n"

      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
-      : "+r"(src_argb),    // %0
-        "+r"(dst_u),       // %1
-        "+r"(dst_v),       // %2
-        "+r"(width)        // %3
-      : "r"(&c->kRGBToU),  // %4
-        "r"(&c->kRGBToV),  // %5
-        "r"(&c->kAddUV)    // %6
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
-        "q10", "q11", "q12");
+      : "+r"(src_argb),     // %0
+        "+r"(dst_u),        // %1
+        "+r"(dst_v),        // %2
+        "+r"(width)         // %3
+      : "r"(&c->kRGBToU),   // %4
+        "r"(&c->kRGBToV),   // %5
+        "r"(&c->kAddUV)     // %6
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }

 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
@ -1911,6 +1903,7 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
 }

+
 // clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR)                                                 \
@ -1932,68 +1925,61 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                            int width,
                            const struct ArgbConstants* c) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile(
-      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes,
-                                                      // only 4 used)
-      "vld1.8      {d25}, [%6]                   \n"  // load kRGBToV
-      "vmovl.s8    q14, d24                      \n"  // U coeffs in d28
-      "vmovl.s8    q15, d25                      \n"  // V coeffs in d30
-      "vmov.u16    q11, #0x8000                  \n"  // 128.0 bias
+  asm volatile (
+      "vld1.8      {d18}, [%5]                   \n"  // load kRGBToU
+      "vld1.8      {d19}, [%6]                   \n"  // load kRGBToV
+      "vmovl.s8    q8, d18                       \n"  // U coeffs in q8 (d16, d17)
+      "vmovl.s8    q9, d19                       \n"  // V coeffs in q9 (d18, d19)
+      "vdup.16     q10, d16[0]                   \n"  // U0
+      "vdup.16     q11, d16[1]                   \n"  // U1
+      "vdup.16     q12, d16[2]                   \n"  // U2
+      "vdup.16     q13, d18[0]                   \n"  // V0
+      "vdup.16     q14, d18[1]                   \n"  // V1
+      "vdup.16     q15, d18[2]                   \n"  // V2

      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
-                                                      // pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels.
      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
-      "vpaddl.u8   q3, q3                        \n"  // A 16 bytes -> 8 shorts.
-      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more pixels.
-      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"
-      "vpadal.u8   q0, q4                        \n"  // B
-      "vpadal.u8   q1, q5                        \n"  // G
-      "vpadal.u8   q2, q6                        \n"  // R
-      "vpadal.u8   q3, q7                        \n"  // A
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ARGB pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ARGB pixels.
+      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
      "vrshr.u16   q1, q1, #2                    \n"
      "vrshr.u16   q2, q2, #2                    \n"
-      "vrshr.u16   q3, q3, #2                    \n"

-      "vdup.16     q12, d28[0]                   \n"
-      "vmul.s16    q8, q0, q12                   \n"  // U = B * U0
-      "vdup.16     q12, d28[1]                   \n"
-      "vmla.s16    q8, q1, q12                   \n"  // U += G * U1
-      "vdup.16     q12, d28[2]                   \n"
+      "vmov.u16    q3, #0x8000                   \n"  // 128.0
+
+      "vmul.s16    q8, q0, q10                   \n"  // U = B * U0
+      "vmla.s16    q8, q1, q11                   \n"  // U += G * U1
      "vmla.s16    q8, q2, q12                   \n"  // U += R * U2
-      "vdup.16     q12, d28[3]                   \n"
-      "vmla.s16    q8, q3, q12                   \n"  // U += A * U3

-      "vdup.16     q12, d30[0]                   \n"
-      "vmul.s16    q9, q0, q12                   \n"  // V = B * V0
-      "vdup.16     q12, d30[1]                   \n"
-      "vmla.s16    q9, q1, q12                   \n"  // V += G * V1
-      "vdup.16     q12, d30[2]                   \n"
-      "vmla.s16    q9, q2, q12                   \n"  // V += R * V2
-      "vdup.16     q12, d30[3]                   \n"
-      "vmla.s16    q9, q3, q12                   \n"  // V += A * V3
+      "vmul.s16    q9, q0, q13                   \n"  // V = B * V0
+      "vmla.s16    q9, q1, q14                   \n"  // V += G * V1
+      "vmla.s16    q9, q2, q15                   \n"  // V += R * V2

-      "vsubhn.s16  d0, q11, q8                   \n"  // 128.0 - U
-      "vsubhn.s16  d1, q11, q9                   \n"  // 128.0 - V
+      "vsubhn.s16  d0, q3, q8                    \n"  // 128.0 - U
+      "vsubhn.s16  d1, q3, q9                    \n"  // 128.0 - V

      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
-      : "+r"(src_argb),    // %0
-        "+r"(src_argb_1),  // %1
-        "+r"(dst_u),       // %2
-        "+r"(dst_v),       // %3
-        "+r"(width)        // %4
-      : "r"(&c->kRGBToU),  // %5
-        "r"(&c->kRGBToV)   // %6
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
-        "q9", "q11", "q12", "q14", "q15");
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_1),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  : "r"(&c->kRGBToU),  // %5
+    "r"(&c->kRGBToV)   // %6
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }

 void ARGBToUVRow_NEON(const uint8_t* src_argb,
@ -2226,8 +2212,44 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-                         &kBgraI601Constants);
+  asm volatile (
+      "add         %1, %0, %1                    \n"  // src_stride + src_bgra
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8000                  \n"  // 128.0
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 BGRA pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 BGRA pixels.
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vpaddl.u8   q3, q3                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q2                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more BGRA pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 BGRA pixels.
+      "vpadal.u8   q3, q7                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q6                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q1, q1, #2                    \n"  // average of 4
+      "vrshr.u16   q2, q2, #2                    \n"
+      "vrshr.u16   q3, q3, #2                    \n"
+
+    RGBTOUV(q3, q2, q1)
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }

 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
@ -2235,8 +2257,44 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                         &kAbgrI601Constants);
+  asm volatile (
+      "add         %1, %0, %1                    \n"  // src_stride + src_abgr
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8000                  \n"  // 128.0
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ABGR pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ABGR pixels.
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vpaddl.u8   q2, q2                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q0, q0                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ABGR pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ABGR pixels.
+      "vpadal.u8   q2, q6                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
+
+    RGBTOUV(q2, q1, q0)
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_stride_abgr),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }

 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
@ -2244,8 +2302,44 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
-                         &kRgbaI601Constants);
+  asm volatile (
+      "add         %1, %0, %1                    \n"  // src_stride + src_rgba
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
+      "vmov.u16    q15, #0x8000                  \n"  // 128.0
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 RGBA pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 RGBA pixels.
+      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
+      "vpaddl.u8   q0, q1                        \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q2                        \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8   q2, q3                        \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more RGBA pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 RGBA pixels.
+      "vpadal.u8   q0, q5                        \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q6                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q2, q7                        \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
+
+    RGBTOUV(q0, q1, q2)
+      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
+      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
+      "bgt         1b                            \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(src_stride_rgba),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(width)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
 }

 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
@ -2703,20 +2797,19 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,

 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                           uint8_t* dst_y,
-                           int width,
-                           const struct ArgbConstants* c) {
+                            uint8_t* dst_y,
+                            int width,
+                            const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d24[0]                   \n"  // B
-      "vdup.8      d21, d24[1]                   \n"  // G
-      "vdup.8      d22, d24[2]                   \n"  // R
-      "vdup.8      d23, d24[3]                   \n"  // A
-      "vdup.16     q12, d25[0]                   \n"  // bias
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // bias (kAddY)
      "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of ARGB
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 pixels
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of ARGB
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
      "subs        %1, %1, #16                   \n"  // 16 processed per loop.
      "vmull.u8    q8, d0, d20                   \n"  // B
      "vmull.u8    q9, d1, d20                   \n"
@ -2724,8 +2817,6 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
      "vmlal.u8    q9, d3, d21                   \n"
      "vmlal.u8    q8, d4, d22                   \n"  // R
      "vmlal.u8    q9, d5, d22                   \n"
-      "vmlal.u8    q8, d6, d23                   \n"  // A
-      "vmlal.u8    q9, d7, d23                   \n"
      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
      "vaddhn.u16  d1, q9, q12                   \n"
      "vst1.8      {d0, d1}, [%2]!               \n"  // store 16 pixels Y.
@ -2735,8 +2826,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
        "+r"(dst_y)        // %2
      : "r"(&c->kRGBToY),  // %3
        "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
-        "d24", "d25");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+        "q12");
 }

 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -2755,33 +2846,65 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
 }

+// RGBA row to Y: byte 0 of each pixel (A) is ignored; bytes 1..3 are weighted
+// by kRGBToY[0..2], then kAddY is added. Same as ARGB except the LD4 lanes used.
+static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+                                  uint8_t* dst_y,
+                                  int width,
+                                  const struct ArgbConstants* c) {
+  asm volatile(
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // kAddY bias in all lanes
+      "1:          \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of RGBA
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 pixels
+      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
+      "vmull.u8    q8, d2, d20                   \n"  // B (byte 1 of each pixel)
+      "vmull.u8    q9, d3, d20                   \n"
+      "vmlal.u8    q8, d4, d21                   \n"  // G (byte 2 of each pixel)
+      "vmlal.u8    q9, d5, d21                   \n"
+      "vmlal.u8    q8, d6, d22                   \n"  // R (byte 3 of each pixel)
+      "vmlal.u8    q9, d7, d22                   \n"
+      "vaddhn.u16  d0, q8, q12                   \n"  // add bias, high 8 bits -> Y
+      "vaddhn.u16  d1, q9, q12                   \n"
+      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
+      "bgt         1b                            \n"  // loop while width > 0
+      : "+r"(src_rgba),    // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      : "r"(&c->kRGBToY),  // %3
+        "r"(&c->kAddY)     // %4
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+        "q12");
+}
+
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
 }

 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
 }

 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
-}
-
-void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_bgra, dst_yj, width, &kBgraJPEGConstants);
+  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
 }

 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d24[0]                   \n"  // BY
-      "vdup.8      d21, d24[1]                   \n"  // GY
-      "vdup.8      d22, d24[2]                   \n"  // RY
-      "vdup.16     q12, d25[0]                   \n"  // AY
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // bias (kAddY)
      "1:          \n"
      "vld3.8      {d2, d4, d6}, [%0]!           \n"  // load 16 pixels of
                                                      // RGB24.
@ -2802,10 +2925,14 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
        "+r"(width)        // %2
      : "r"(&c->kRGBToY),  // %3
        "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
-        "d24", "d25");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+        "q12");
 }

+
+
+
+
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_NEON(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@ -1249,22 +1249,16 @@ void MergeUVRow_RVV(const uint8_t* src_u,
 }
 #endif

+
+
 // RGB to JPeg coefficients
 // B * 0.1140 coefficient = 29
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
-static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                         {0},
-                                                         {0},
-                                                         {128},
-                                                         {0}};
+static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {0}, {0}, {128}, {0}};

-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
-                                                       {0},
-                                                       {0},
-                                                       {128},
-                                                       {0}};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0}, {128}, {0}};

 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@ -1272,24 +1266,16 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
 // R * 0.2578 coefficient = 66
 // Add 16.5 = 0x1080

-static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                         {0},
-                                                         {0},
-                                                         {0x1080},
-                                                         {0}};
+static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {0}, {0}, {0x1080}, {0}};

-static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                       {0},
-                                                       {0},
-                                                       {0x1080},
-                                                       {0}};
+static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {0}, {0}, {0x1080}, {0}};

 // ARGB expects first 3 values to contain RGB and 4th value is ignored
 #ifdef HAS_ARGBTOYMATRIXROW_RVV
 void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
-                          uint8_t* dst_y,
-                          int width,
-                          const struct ArgbConstants* c) {
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c) {
  assert(width != 0);
  size_t w = (size_t)width;
  vuint8m2_t v_by, v_gy, v_ry;  // vectors are to store RGBToY constant
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@ -1127,10 +1127,9 @@ __arm_locally_streaming void ARGBToUVMatrixRow_SME(
    uint8_t* dst_v,
    int width,
    const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1],
-                           (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-                           (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
-                           (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
+      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                           uvconstants);
 }
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -223,10 +223,9 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                            uint8_t* dst_v,
                            int width,
                            const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1],
-                           (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-                           (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
-                           (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
+      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                           uvconstants);
 }
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -8,19 +8,19 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"  // For ArgbConstants

 // This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) &&                                 \
-    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
-     defined(_M_X86)) &&                                            \
-    ((defined(_MSC_VER) && !defined(__clang__)) ||                  \
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__) || \
+     defined(_M_X64) || defined(_M_X86)) && \
+    ((defined(_MSC_VER) && !defined(__clang__)) || \
     defined(LIBYUV_ENABLE_ROWWIN))

 #include <emmintrin.h>
-#include <immintrin.h>  // For AVX2 intrinsics
 #include <tmmintrin.h>  // For _mm_maddubs_epi16
+#include <immintrin.h>  // For AVX2 intrinsics

 #ifdef __cplusplus
 namespace libyuv {
@ -102,91 +102,42 @@ extern "C" {
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  dst_argb += 32;

-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+#if defined(HAS_I422TOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_I444TOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+
+#endif
+
+#if defined(HAS_ARGBTOYROW_AVX2)

 #if defined(__clang__) || defined(__GNUC__)
 #define LIBYUV_TARGET_AVX2 __attribute__((target("avx2")))
-#define LIBYUV_TARGET_AVX512BW \
-  __attribute__((target("avx512bw,avx512vl,avx512f")))
+#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f")))
 #else
 #define LIBYUV_TARGET_AVX2
 #define LIBYUV_TARGET_AVX512BW
 #endif

-// Convert 32 ARGB pixels (128 bytes) to 32 UV444 values.
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2) || defined(HAS_ARGBTOUV444MATRIXROW_AVX2)
-LIBYUV_TARGET_AVX2
-void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
-                               uint8_t* dst_u,
-                               uint8_t* dst_v,
-                               int width,
-                               const struct ArgbConstants* c) {
-  __m256i ymm_u =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
-  __m256i ymm_v =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
-  __m256i ymm5 = _mm256_set1_epi16((short)0x8000);
-  __m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
-    __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
-    __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + 64));
-    __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + 96));
-    src_argb += 128;
-
-    __m256i ymm0_u = _mm256_maddubs_epi16(ymm0, ymm_u);
-    __m256i ymm1_u = _mm256_maddubs_epi16(ymm1, ymm_u);
-    __m256i ymm2_u = _mm256_maddubs_epi16(ymm2, ymm_u);
-    __m256i ymm3_u = _mm256_maddubs_epi16(ymm3, ymm_u);
-
-    __m256i ymm0_v = _mm256_maddubs_epi16(ymm0, ymm_v);
-    __m256i ymm1_v = _mm256_maddubs_epi16(ymm1, ymm_v);
-    __m256i ymm2_v = _mm256_maddubs_epi16(ymm2, ymm_v);
-    __m256i ymm3_v = _mm256_maddubs_epi16(ymm3, ymm_v);
-
-    ymm0_u = _mm256_hadd_epi16(ymm0_u, ymm1_u);
-    ymm2_u = _mm256_hadd_epi16(ymm2_u, ymm3_u);
-
-    ymm0_v = _mm256_hadd_epi16(ymm0_v, ymm1_v);
-    ymm2_v = _mm256_hadd_epi16(ymm2_v, ymm3_v);
-
-    ymm0_u = _mm256_sub_epi16(ymm5, ymm0_u);
-    ymm2_u = _mm256_sub_epi16(ymm5, ymm2_u);
-
-    ymm0_v = _mm256_sub_epi16(ymm5, ymm0_v);
-    ymm2_v = _mm256_sub_epi16(ymm5, ymm2_v);
-
-    ymm0_u = _mm256_srli_epi16(ymm0_u, 8);
-    ymm2_u = _mm256_srli_epi16(ymm2_u, 8);
-
-    ymm0_v = _mm256_srli_epi16(ymm0_v, 8);
-    ymm2_v = _mm256_srli_epi16(ymm2_v, 8);
-
-    ymm0_u = _mm256_packus_epi16(ymm0_u, ymm2_u);
-    ymm0_u = _mm256_permutevar8x32_epi32(ymm0_u, perm_mask);
-
-    ymm0_v = _mm256_packus_epi16(ymm0_v, ymm2_v);
-    ymm0_v = _mm256_permutevar8x32_epi32(ymm0_v, perm_mask);
-
-    _mm256_storeu_si256((__m256i*)dst_u, ymm0_u);
-    _mm256_storeu_si256((__m256i*)dst_v, ymm0_v);
-    dst_u += 32;
-    dst_v += 32;
-    width -= 32;
-  }
-}
-#endif
 LIBYUV_TARGET_AVX2
 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_y,
                           int width,
                           const struct ArgbConstants* c) {
  __m256i ymm5 = _mm256_set1_epi8((char)0x80);
-  __m256i ymm4 =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToY));
-  __m256i ymm7 =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddY));
+  __m128i kRGBToY = _mm_loadu_si128((const __m128i*)c->kRGBToY);
+  __m256i ymm4 = _mm256_broadcastsi128_si256(kRGBToY);
+  __m128i kAddY = _mm_loadu_si128((const __m128i*)c->kAddY);
+  __m256i ymm7 = _mm256_broadcastsi128_si256(kAddY);
  __m256i ymm6 = _mm256_maddubs_epi16(ymm4, ymm5);
  ymm6 = _mm256_hadd_epi16(ymm6, ymm6);
  ymm7 = _mm256_sub_epi16(ymm7, ymm6);
@ -266,33 +217,27 @@ void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
 LIBYUV_TARGET_AVX2
 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  __m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
-  __m128i shuf_low =
-      _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
-  __m128i shuf_high =
-      _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
+  __m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
+  __m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
  __m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low);
  __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high);

  while (width > 0) {
    __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw);
    __m256i ymm0 = _mm256_castsi128_si256(xmm0);
-    ymm0 = _mm256_inserti128_si256(
-        ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
+    ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);

    __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24));
    __m256i ymm1 = _mm256_castsi128_si256(xmm1);
-    ymm1 = _mm256_inserti128_si256(
-        ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);
+    ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);

    __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48));
    __m256i ymm2 = _mm256_castsi128_si256(xmm2);
-    ymm2 = _mm256_inserti128_si256(
-        ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
+    ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);

    __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68));
    __m256i ymm3 = _mm256_castsi128_si256(xmm3);
-    ymm3 = _mm256_inserti128_si256(
-        ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);
+    ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);

    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@ -318,13 +263,10 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {

 #ifdef HAS_RAWTOARGBROW_AVX512BW
 LIBYUV_TARGET_AVX512BW
-void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
-                           uint8_t* dst_argb,
-                           const __m128i* shuffler,
-                           int width) {
+void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) {
  __m512i zmm_alpha = _mm512_set1_epi32(0xff000000);
-  __m512i zmm_perm =
-      _mm512_set_epi32(12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
+  __m512i zmm_perm = _mm512_set_epi32(
+      12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
  __m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler));

  while (width > 0) {
@ -360,26 +302,20 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
 }

 LIBYUV_TARGET_AVX512BW
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw,
-                           uint8_t* dst_argb,
-                           int width) {
-  __m128i shuf =
-      _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
+void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  __m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
  RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width);
 }

 LIBYUV_TARGET_AVX512BW
-void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24,
-                             uint8_t* dst_argb,
-                             int width) {
-  __m128i shuf =
-      _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
+void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+  __m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
  RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width);
 }
 #endif

 #ifdef HAS_ARGBTOUVMATRIXROW_AVX2
-LIBYUV_TARGET_AVX2
+LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
 void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
@ -389,19 +325,16 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
  __m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
  __m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
  __m256i ymm_0101 = _mm256_set1_epi16(0x0101);
-  __m256i ymm_shuf =
-      _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, 0,
-                       4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
+  __m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+                                      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
  __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
  __m256i ymm_zero = _mm256_setzero_si256();

  while (width > 0) {
    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
    __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
-    __m256i ymm2 =
-        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
-    __m256i ymm3 =
-        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
+    __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
+    __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));

    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@ -470,515 +403,12 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
 }
 #endif

-#ifdef HAS_MIRRORROW_AVX2
-LIBYUV_TARGET_AVX2
-void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
-  src += width;
-  while (width > 0) {
-    src -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src);
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e);
-    _mm256_storeu_si256((__m256i*)dst, ymm0);
-    dst += 32;
-    width -= 32;
-  }
-}
 #endif

-#ifdef HAS_MIRRORUVROW_AVX2
-LIBYUV_TARGET_AVX2
-void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
-  src_uv += width * 2;
-  while (width > 0) {
-    src_uv -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv);
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e);
-    _mm256_storeu_si256((__m256i*)dst_uv, ymm0);
-    dst_uv += 32;
-    width -= 16;
-  }
-}
-#endif
-
-#ifdef HAS_MIRRORSPLITUVROW_AVX2
-LIBYUV_TARGET_AVX2
-void MirrorSplitUVRow_AVX2(const uint8_t* src_uv,
-                           uint8_t* dst_u,
-                           uint8_t* dst_v,
-                           int width) {
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
-  src_uv += width * 2;
-  while (width > 0) {
-    src_uv -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv);
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm0 = _mm256_permute4x64_epi64(ymm0, 0x72);
-    _mm_storeu_si128((__m128i*)dst_u, _mm256_castsi256_si128(ymm0));
-    _mm_storeu_si128((__m128i*)dst_v, _mm256_extracti128_si256(ymm0, 1));
-    dst_u += 16;
-    dst_v += 16;
-    width -= 16;
-  }
-}
-#endif
-
-#ifdef HAS_RGB24MIRRORROW_AVX2
-LIBYUV_TARGET_AVX2
-void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_rgb24,
-                         int width) {
-  __m256i shuf0 =
-      _mm256_setr_epi8(-1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2, -1,
-                       12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2);
-  __m128i shuf1 =
-      _mm_setr_epi8(13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1);
-
-  src_rgb24 += width * 3 - 96;
-  while (width > 0) {
-    __m128i v0_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 0));
-    __m128i v0_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 15));
-    __m256i v0 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1);
-
-    __m128i v1_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 30));
-    __m128i v1_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 45));
-    __m256i v1 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1);
-
-    __m128i v2_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 60));
-    __m128i v2_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 75));
-    __m256i v2 =
-        _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1);
-
-    __m128i v3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 80));
-
-    v0 = _mm256_shuffle_epi8(v0, shuf0);
-    v1 = _mm256_shuffle_epi8(v1, shuf0);
-    v2 = _mm256_shuffle_epi8(v2, shuf0);
-    v3 = _mm_shuffle_epi8(v3, shuf1);
-
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 80), _mm256_castsi256_si128(v0));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 65),
-                     _mm256_extracti128_si256(v0, 1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 50), _mm256_castsi256_si128(v1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 35),
-                     _mm256_extracti128_si256(v1, 1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 20), _mm256_castsi256_si128(v2));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 5),
-                     _mm256_extracti128_si256(v2, 1));
-    _mm_storel_epi64((__m128i*)(dst_rgb24 + 0), v3);
-
-    src_rgb24 -= 96;
-    dst_rgb24 += 96;
-    width -= 32;
-  }
-}
-#endif
-
-#ifdef HAS_INTERPOLATEROW_AVX2
-LIBYUV_TARGET_AVX2
-void InterpolateRow_AVX2(uint8_t* dst_ptr,
-                         const uint8_t* src_ptr,
-                         ptrdiff_t src_stride,
-                         int width,
-                         int source_y_fraction) {
-  int y1 = source_y_fraction;
-  int y0 = 256 - y1;
-  const uint8_t* src_ptr1 = src_ptr + src_stride;
-  __m256i ymm_y = _mm256_set1_epi16((y1 << 8) | y0);
-  __m256i ymm_8080 = _mm256_set1_epi16(0x8080);
-  int i;
-
-  if (y1 == 0) {
-    for (i = 0; i < width; i += 32) {
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_loadu_si256((const __m256i*)(src_ptr + i)));
-    }
-  } else if (y1 == 128) {
-    for (i = 0; i < width; i += 32) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i), _mm256_avg_epu8(row0, row1));
-    }
-  } else {
-    for (i = 0; i < width; i += 32) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      __m256i low = _mm256_unpacklo_epi8(row0, row1);
-      __m256i high = _mm256_unpackhi_epi8(row0, row1);
-      low = _mm256_sub_epi8(low, ymm_8080);
-      high = _mm256_sub_epi8(high, ymm_8080);
-      low = _mm256_maddubs_epi16(ymm_y, low);
-      high = _mm256_maddubs_epi16(ymm_y, high);
-      low = _mm256_add_epi16(low, ymm_8080);
-      high = _mm256_add_epi16(high, ymm_8080);
-      low = _mm256_srli_epi16(low, 8);
-      high = _mm256_srli_epi16(high, 8);
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_packus_epi16(low, high));
-    }
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_INTERPOLATEROW_16_AVX2
-LIBYUV_TARGET_AVX2
-void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
-                            const uint16_t* src_ptr,
-                            ptrdiff_t src_stride,
-                            int width,
-                            int source_y_fraction) {
-  int y1 = source_y_fraction;
-  int y0 = 256 - y1;
-  const uint16_t* src_ptr1 = src_ptr + src_stride;
-  __m256i ymm_y = _mm256_set1_epi32((y1 << 16) | y0);
-  __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
-  __m256i ymm_round = _mm256_set1_epi32(8388736);  // 0x800000 + 128
-  int i;
-
-  if (y1 == 0) {
-    for (i = 0; i < width; i += 16) {
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_loadu_si256((const __m256i*)(src_ptr + i)));
-    }
-  } else if (y1 == 128) {
-    for (i = 0; i < width; i += 16) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_avg_epu16(row0, row1));
-    }
-  } else {
-    for (i = 0; i < width; i += 16) {
-      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
-      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      __m256i row0l = _mm256_unpacklo_epi16(row0, row1);
-      __m256i row0h = _mm256_unpackhi_epi16(row0, row1);
-      row0l = _mm256_sub_epi16(row0l, ymm_8000);
-      row0h = _mm256_sub_epi16(row0h, ymm_8000);
-      __m256i resl = _mm256_madd_epi16(row0l, ymm_y);
-      __m256i resh = _mm256_madd_epi16(row0h, ymm_y);
-      resl = _mm256_add_epi32(resl, ymm_round);
-      resh = _mm256_add_epi32(resh, ymm_round);
-      resl = _mm256_srai_epi32(resl, 8);
-      resh = _mm256_srai_epi32(resh, 8);
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
-                          _mm256_packus_epi32(resl, resh));
-    }
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGBMIRRORROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
-  __m256i ymm_shuf = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-  src += width * 4;
-  while (width > 0) {
-    src -= 32;
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src);
-    ymm0 = _mm256_permutevar8x32_epi32(ymm0, ymm_shuf);
-    _mm256_storeu_si256((__m256i*)dst, ymm0);
-    dst += 32;
-    width -= 8;
-  }
-}
-#endif
-
-#ifdef HAS_J400TOARGBROW_AVX2
-alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = {
-    0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
-    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u};
-alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = {
-    8u,   8u,   8u,  128u, 9u,   9u,   9u,  128u, 10u,  10u, 10u,
-    128u, 11u,  11u, 11u,  128u, 12u,  12u, 12u,  128u, 13u, 13u,
-    13u,  128u, 14u, 14u,  14u,  128u, 15u, 15u,  15u,  128u};
-
-LIBYUV_TARGET_AVX2
-void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-  __m256i ymm_mask0 =
-      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
-  __m256i ymm_mask1 =
-      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
-  __m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u);
-
-  while (width > 0) {
-    __m256i ymm0 =
-        _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
-
-    __m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0);
-    __m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1);
-
-    ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
-    ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm1);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm2);
-
-    src_y += 16;
-    dst_argb += 64;
-    width -= 16;
-  }
-}
-#endif  // HAS_J400TOARGBROW_AVX2
-
-#ifdef HAS_RGB24TOARGBROW_AVX2
-alignas(16) static const uint8_t kShuffleMaskRGB24ToARGB[2][16] = {
-    {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u},
-    {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u,
-     128u}};
-#endif
-
-#ifdef HAS_RGB565TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
-                          uint8_t* dst_argb,
-                          int width) {
-  __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
-  __m256i ymm_scale_g = _mm256_set1_epi32(0x20802080);
-  __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
-  __m256i ymm_mask_g = _mm256_set1_epi16(0x07e0);
-  __m256i ymm_mask_a = _mm256_set1_epi16((short)0xff00);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_rgb565);
-    __m256i ymm1 = ymm0;
-    __m256i ymm2 = ymm0;
-
-    ymm1 = _mm256_and_si256(ymm1, ymm_mask_b);
-    ymm2 = _mm256_slli_epi16(ymm2, 11);
-    ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
-    ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
-    ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
-
-    ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
-    ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
-    ymm0 = _mm256_or_si256(ymm0, ymm_mask_a);  // GA
-
-    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
-    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
-
-    ymm0 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20);
-    ymm1 = _mm256_permute2x128_si256(ymm2, ymm1, 0x31);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm0);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-
-    src_rgb565 += 32;
-    dst_argb += 64;
-    width -= 16;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGB1555TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
-                            uint8_t* dst_argb,
-                            int width) {
-  __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
-  __m256i ymm_scale_g = _mm256_set1_epi32(0x42004200);
-  __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
-  __m256i ymm_mask_g = _mm256_set1_epi16(0x03e0);
-  __m256i ymm_mask_a = _mm256_set1_epi16((short)0xff00);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb1555);
-    __m256i ymm1 = ymm0;
-    __m256i ymm2 = ymm0;
-
-    ymm1 = _mm256_slli_epi16(ymm1, 1);
-    ymm2 = _mm256_slli_epi16(ymm2, 11);
-    ymm1 = _mm256_and_si256(ymm1, ymm_mask_b);
-    ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
-    ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
-    ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
-
-    ymm2 = ymm0;
-    ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
-    ymm2 = _mm256_srai_epi16(ymm2, 8);
-    ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
-    ymm2 = _mm256_and_si256(ymm2, ymm_mask_a);
-    ymm0 = _mm256_or_si256(ymm0, ymm2);  // GA
-
-    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
-    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
-
-    ymm0 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20);
-    ymm1 = _mm256_permute2x128_si256(ymm2, ymm1, 0x31);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm0);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-
-    src_argb1555 += 32;
-    dst_argb += 64;
-    width -= 16;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGB4444TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
-                            uint8_t* dst_argb,
-                            int width) {
-  __m256i ymm_mask = _mm256_set1_epi32(0x0f0f0f0f);
-  __m256i ymm_mask2 = _mm256_slli_epi32(ymm_mask, 4);
-
-  while (width > 0) {
-    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb4444);
-    __m256i ymm2 = ymm0;
-
-    ymm0 = _mm256_and_si256(ymm0, ymm_mask);
-    ymm2 = _mm256_and_si256(ymm2, ymm_mask2);
-
-    __m256i ymm1 = ymm0;
-    __m256i ymm3 = ymm2;
-
-    ymm1 = _mm256_slli_epi16(ymm1, 4);
-    ymm3 = _mm256_srli_epi16(ymm3, 4);
-
-    ymm0 = _mm256_or_si256(ymm0, ymm1);
-    ymm2 = _mm256_or_si256(ymm2, ymm3);
-
-    ymm1 = ymm0;
-    ymm0 = _mm256_unpacklo_epi8(ymm0, ymm2);
-    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm2);
-
-    ymm2 = _mm256_permute2x128_si256(ymm0, ymm1, 0x20);
-    ymm1 = _mm256_permute2x128_si256(ymm0, ymm1, 0x31);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm2);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-
-    src_argb4444 += 32;
-    dst_argb += 64;
-    width -= 16;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_RGB24TOARGBROW_AVX2
-LIBYUV_TARGET_AVX2
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24,
-                         uint8_t* dst_argb,
-                         int width) {
-  __m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0]));
-  __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(
-      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1]));
-
-  while (width > 0) {
-    __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_rgb24);
-    __m256i ymm0 = _mm256_castsi128_si256(xmm0);
-    ymm0 = _mm256_inserti128_si256(
-        ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1);
-
-    __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 24));
-    __m256i ymm1 = _mm256_castsi128_si256(xmm1);
-    ymm1 = _mm256_inserti128_si256(
-        ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1);
-
-    __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 48));
-    __m256i ymm2 = _mm256_castsi128_si256(xmm2);
-    ymm2 = _mm256_inserti128_si256(
-        ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1);
-
-    __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 68));
-    __m256i ymm3 = _mm256_castsi128_si256(xmm3);
-    ymm3 = _mm256_inserti128_si256(
-        ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1);
-
-    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
-    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
-    ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
-    ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf2);
-
-    ymm0 = _mm256_or_si256(ymm0, ymm_alpha);
-    ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
-    ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
-    ymm3 = _mm256_or_si256(ymm3, ymm_alpha);
-
-    _mm256_storeu_si256((__m256i*)dst_argb, ymm0);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 64), ymm2);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 96), ymm3);
-
-    src_rgb24 += 96;
-    dst_argb += 128;
-    width -= 32;
-  }
-  _mm256_zeroupper();
-}
-#endif
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX2
-LIBYUV_TARGET_AVX2
-void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
-                         uint8_t* dst_argb,
-                         const uint8_t* shuffler,
-                         int width) {
-  __m256i control =
-      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)shuffler));
-  while (width >= 16) {
-    __m256i row = _mm256_loadu_si256((const __m256i*)src_argb);
-    __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
-    row = _mm256_shuffle_epi8(row, control);
-    row1 = _mm256_shuffle_epi8(row1, control);
-    _mm256_storeu_si256((__m256i*)dst_argb, row);
-    _mm256_storeu_si256((__m256i*)(dst_argb + 32), row1);
-    src_argb += 64;
-    dst_argb += 64;
-    width -= 16;
-  }
-}
-#endif
-
-#ifdef HAS_ARGBSHUFFLEROW_AVX512BW
-LIBYUV_TARGET_AVX512BW
-void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
-                             uint8_t* dst_argb,
-                             const uint8_t* shuffler,
-                             int width) {
-  __m512i control =
-      _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)shuffler));
-  while (width >= 32) {
-    __m512i row = _mm512_loadu_si512((const __m512i*)src_argb);
-    __m512i row1 = _mm512_loadu_si512((const __m512i*)(src_argb + 64));
-    row = _mm512_shuffle_epi8(row, control);
-    row1 = _mm512_shuffle_epi8(row1, control);
-    _mm512_storeu_si512((__m512i*)dst_argb, row);
-    _mm512_storeu_si512((__m512i*)(dst_argb + 64), row1);
-    src_argb += 128;
-    dst_argb += 128;
-    width -= 32;
-  }
-}
-#endif
-
-#endif

 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif

-#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) ||
-        // defined(__i386__) || defined(_M_X64) || defined(_M_X86)) &&
-        // ((defined(_MSC_VER) && !defined(__clang__)) ||
-        // defined(LIBYUV_ENABLE_ROWWIN))
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_X86)) && ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN))
--- a/source/scale.cc
+++ b/source/scale.cc
@ -11,7 +11,6 @@
 #include "libyuv/scale.h"

 #include <assert.h>
-#include <limits.h>
 #include <string.h>

 #include "libyuv/cpu_id.h"
@ -40,8 +39,8 @@ static void ScalePlaneDown2(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                            const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            enum FilterMode filtering) {
@ -52,7 +51,7 @@ static void ScalePlaneDown2(int src_width,
          ? ScaleRowDown2_C
          : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
                                        : ScaleRowDown2Box_C);
-  ptrdiff_t row_stride = src_stride * 2;
+  int row_stride = src_stride * 2;
  (void)src_width;
  (void)src_height;
  if (!filtering) {
@ -152,8 +151,8 @@ static void ScalePlaneDown2_16(int src_width,
                               int src_height,
                               int dst_width,
                               int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                               const uint16_t* src_ptr,
                               uint16_t* dst_ptr,
                               enum FilterMode filtering) {
@ -164,7 +163,7 @@ static void ScalePlaneDown2_16(int src_width,
          ? ScaleRowDown2_16_C
          : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
                                        : ScaleRowDown2Box_16_C);
-  ptrdiff_t row_stride = src_stride * 2;
+  int row_stride = src_stride * 2;
  (void)src_width;
  (void)src_height;
  if (!filtering) {
@ -229,7 +228,7 @@ void ScalePlaneDown2_16To8(int src_width,
                 ? ScaleRowDown2_16To8_C
                 : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C
                                               : ScaleRowDown2Box_16To8_C));
-  ptrdiff_t row_stride = (ptrdiff_t)src_stride * 2;
+  int row_stride = src_stride * 2;
  (void)dst_height;
  if (!filtering) {
    src_ptr += src_stride;  // Point to odd rows.
@ -260,8 +259,8 @@ static void ScalePlaneDown4(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                            const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            enum FilterMode filtering) {
@ -269,7 +268,7 @@ static void ScalePlaneDown4(int src_width,
  void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                        uint8_t* dst_ptr, int dst_width) =
      filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
-  ptrdiff_t row_stride = src_stride * 4;
+  int row_stride = src_stride * 4;
  (void)src_width;
  (void)src_height;
  if (!filtering) {
@ -332,8 +331,8 @@ static void ScalePlaneDown4_16(int src_width,
                               int src_height,
                               int dst_width,
                               int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                               const uint16_t* src_ptr,
                               uint16_t* dst_ptr,
                               enum FilterMode filtering) {
@ -341,7 +340,7 @@ static void ScalePlaneDown4_16(int src_width,
  void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                        uint16_t* dst_ptr, int dst_width) =
      filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
-  ptrdiff_t row_stride = src_stride * 4;
+  int row_stride = src_stride * 4;
  (void)src_width;
  (void)src_height;
  if (!filtering) {
@ -376,8 +375,8 @@ static void ScalePlaneDown34(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             enum FilterMode filtering) {
@ -386,7 +385,7 @@ static void ScalePlaneDown34(int src_width,
                           uint8_t* dst_ptr, int dst_width);
  void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                           uint8_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
  (void)src_width;
  (void)src_height;
  assert(dst_width % 3 == 0);
@ -503,8 +502,8 @@ static void ScalePlaneDown34_16(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                enum FilterMode filtering) {
@ -513,7 +512,7 @@ static void ScalePlaneDown34_16(int src_width,
                           uint16_t* dst_ptr, int dst_width);
  void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                           uint16_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
  (void)src_width;
  (void)src_height;
  assert(dst_width % 3 == 0);
@ -589,8 +588,8 @@ static void ScalePlaneDown38(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             enum FilterMode filtering) {
@ -599,7 +598,7 @@ static void ScalePlaneDown38(int src_width,
                           uint8_t* dst_ptr, int dst_width);
  void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                           uint8_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
  assert(dst_width % 3 == 0);
  (void)src_width;
  (void)src_height;
@ -709,8 +708,8 @@ static void ScalePlaneDown38_16(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                enum FilterMode filtering) {
@ -719,7 +718,7 @@ static void ScalePlaneDown38_16(int src_width,
                           uint16_t* dst_ptr, int dst_width);
  void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                           uint16_t* dst_ptr, int dst_width);
-  const ptrdiff_t filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
  (void)src_width;
  (void)src_height;
  assert(dst_width % 3 == 0);
@ -902,8 +901,8 @@ static int ScalePlaneBox(int src_width,
                         int src_height,
                         int dst_width,
                         int dst_height,
-                         ptrdiff_t src_stride,
-                         ptrdiff_t dst_stride,
+                         int src_stride,
+                         int dst_stride,
                         const uint8_t* src_ptr,
                         uint8_t* dst_ptr) {
  int j, k;
@ -968,7 +967,7 @@ static int ScalePlaneBox(int src_width,
    for (j = 0; j < dst_height; ++j) {
      int boxheight;
      int iy = y >> 16;
-      const uint8_t* src = src_ptr + iy * src_stride;
+      const uint8_t* src = src_ptr + iy * (int64_t)src_stride;
      y += dy;
      if (y > max_y) {
        y = max_y;
@ -991,8 +990,8 @@ static int ScalePlaneBox_16(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                            const uint16_t* src_ptr,
                            uint16_t* dst_ptr) {
  int j, k;
@ -1025,7 +1024,7 @@ static int ScalePlaneBox_16(int src_width,
    for (j = 0; j < dst_height; ++j) {
      int boxheight;
      int iy = y >> 16;
-      const uint16_t* src = src_ptr + iy * src_stride;
+      const uint16_t* src = src_ptr + iy * (int64_t)src_stride;
      y += dy;
      if (y > max_y) {
        y = max_y;
@ -1049,8 +1048,8 @@ static int ScalePlaneBilinearDown(int src_width,
                                  int src_height,
                                  int dst_width,
                                  int dst_height,
-                                  ptrdiff_t src_stride,
-                                  ptrdiff_t dst_stride,
+                                  int src_stride,
+                                  int dst_stride,
                                  const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  enum FilterMode filtering) {
@ -1077,6 +1076,14 @@ static int ScalePlaneBilinearDown(int src_width,
             &dx, &dy);
  src_width = Abs(src_width);

+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
@ -1139,7 +1146,7 @@ static int ScalePlaneBilinearDown(int src_width,

  for (j = 0; j < dst_height; ++j) {
    int yi = y >> 16;
-    const uint8_t* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
    if (filtering == kFilterLinear) {
      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
    } else {
@ -1161,8 +1168,8 @@ static int ScalePlaneBilinearDown_16(int src_width,
                                     int src_height,
                                     int dst_width,
                                     int dst_height,
-                                     ptrdiff_t src_stride,
-                                     ptrdiff_t dst_stride,
+                                     int src_stride,
+                                     int dst_stride,
                                     const uint16_t* src_ptr,
                                     uint16_t* dst_ptr,
                                     enum FilterMode filtering) {
@ -1189,6 +1196,14 @@ static int ScalePlaneBilinearDown_16(int src_width,
             &dx, &dy);
  src_width = Abs(src_width);

+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_16_Any_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_16_Any_SSSE3;
@ -1230,7 +1245,7 @@ static int ScalePlaneBilinearDown_16(int src_width,

  for (j = 0; j < dst_height; ++j) {
    int yi = y >> 16;
-    const uint16_t* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
    if (filtering == kFilterLinear) {
      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
    } else {
@ -1253,8 +1268,8 @@ static int ScalePlaneBilinearUp(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                enum FilterMode filtering) {
@ -1275,6 +1290,14 @@ static int ScalePlaneBilinearUp(int src_width,
             &dx, &dy);
  src_width = Abs(src_width);

+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
@ -1340,7 +1363,7 @@ static int ScalePlaneBilinearUp(int src_width,
  }
  {
    int yi = y >> 16;
-    const uint8_t* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * (int64_t)src_stride;

    // Allocate 2 row buffers.
    const int row_size = (dst_width + 31) & ~31;
@ -1349,7 +1372,7 @@ static int ScalePlaneBilinearUp(int src_width,
      return 1;

    uint8_t* rowptr = row;
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
    int lasty = yi;

    ScaleFilterCols(rowptr, src, dst_width, x, dx);
@ -1367,7 +1390,7 @@ static int ScalePlaneBilinearUp(int src_width,
        if (y > max_y) {
          y = max_y;
          yi = y >> 16;
-          src = src_ptr + yi * src_stride;
+          src = src_ptr + yi * (int64_t)src_stride;
        }
        if (yi != lasty) {
          ScaleFilterCols(rowptr, src, dst_width, x, dx);
@ -1402,8 +1425,8 @@ static void ScalePlaneUp2_Linear(int src_width,
                                 int src_height,
                                 int dst_width,
                                 int dst_height,
-                                 ptrdiff_t src_stride,
-                                 ptrdiff_t dst_stride,
+                                 int src_stride,
+                                 int dst_stride,
                                 const uint8_t* src_ptr,
                                 uint8_t* dst_ptr) {
  void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
@ -1446,13 +1469,13 @@ static void ScalePlaneUp2_Linear(int src_width,
 #endif

  if (dst_height == 1) {
-    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
               dst_width);
  } else {
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
      dst_ptr += dst_stride;
      y += dy;
    }
@ -1467,8 +1490,8 @@ static void ScalePlaneUp2_Bilinear(int src_width,
                                   int src_height,
                                   int dst_width,
                                   int dst_height,
-                                   ptrdiff_t src_stride,
-                                   ptrdiff_t dst_stride,
+                                   int src_stride,
+                                   int dst_stride,
                                   const uint8_t* src_ptr,
                                   uint8_t* dst_ptr) {
  void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
@ -1533,8 +1556,8 @@ static void ScalePlaneUp2_12_Linear(int src_width,
                                    int src_height,
                                    int dst_width,
                                    int dst_height,
-                                    ptrdiff_t src_stride,
-                                    ptrdiff_t dst_stride,
+                                    int src_stride,
+                                    int dst_stride,
                                    const uint16_t* src_ptr,
                                    uint16_t* dst_ptr) {
  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
@ -1566,13 +1589,13 @@ static void ScalePlaneUp2_12_Linear(int src_width,
 #endif

  if (dst_height == 1) {
-    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
               dst_width);
  } else {
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
      dst_ptr += dst_stride;
      y += dy;
    }
@ -1588,8 +1611,8 @@ static void ScalePlaneUp2_12_Bilinear(int src_width,
                                      int src_height,
                                      int dst_width,
                                      int dst_height,
-                                      ptrdiff_t src_stride,
-                                      ptrdiff_t dst_stride,
+                                      int src_stride,
+                                      int dst_stride,
                                      const uint16_t* src_ptr,
                                      uint16_t* dst_ptr) {
  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
@ -1636,8 +1659,8 @@ static void ScalePlaneUp2_16_Linear(int src_width,
                                    int src_height,
                                    int dst_width,
                                    int dst_height,
-                                    ptrdiff_t src_stride,
-                                    ptrdiff_t dst_stride,
+                                    int src_stride,
+                                    int dst_stride,
                                    const uint16_t* src_ptr,
                                    uint16_t* dst_ptr) {
  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
@ -1669,13 +1692,13 @@ static void ScalePlaneUp2_16_Linear(int src_width,
 #endif

  if (dst_height == 1) {
-    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
+    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
               dst_width);
  } else {
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
+      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
      dst_ptr += dst_stride;
      y += dy;
    }
@ -1686,8 +1709,8 @@ static void ScalePlaneUp2_16_Bilinear(int src_width,
                                      int src_height,
                                      int dst_width,
                                      int dst_height,
-                                      ptrdiff_t src_stride,
-                                      ptrdiff_t dst_stride,
+                                      int src_stride,
+                                      int dst_stride,
                                      const uint16_t* src_ptr,
                                      uint16_t* dst_ptr) {
  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
@ -1734,8 +1757,8 @@ static int ScalePlaneBilinearUp_16(int src_width,
                                   int src_height,
                                   int dst_width,
                                   int dst_height,
-                                   ptrdiff_t src_stride,
-                                   ptrdiff_t dst_stride,
+                                   int src_stride,
+                                   int dst_stride,
                                   const uint16_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   enum FilterMode filtering) {
@ -1756,6 +1779,14 @@ static int ScalePlaneBilinearUp_16(int src_width,
             &dx, &dy);
  src_width = Abs(src_width);

+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_16_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_16_Any_SSSE3;
@ -1807,12 +1838,12 @@ static int ScalePlaneBilinearUp_16(int src_width,
  }
  {
    int yi = y >> 16;
-    const uint16_t* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;

    // Allocate 2 row buffers.
    const int row_size = (dst_width + 31) & ~31;
    align_buffer_64(row, row_size * 4);
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
    int lasty = yi;
    uint16_t* rowptr = (uint16_t*)row;
    if (!row)
@ -1833,7 +1864,7 @@ static int ScalePlaneBilinearUp_16(int src_width,
        if (y > max_y) {
          y = max_y;
          yi = y >> 16;
-          src = src_ptr + yi * src_stride;
+          src = src_ptr + yi * (int64_t)src_stride;
        }
        if (yi != lasty) {
          ScaleFilterCols(rowptr, src, dst_width, x, dx);
@ -1868,8 +1899,8 @@ static void ScalePlaneSimple(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr) {
  int i;
@ -1894,7 +1925,8 @@ static void ScalePlaneSimple(int src_width,
  }

  for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+              dx);
    dst_ptr += dst_stride;
    y += dy;
  }
@ -1904,8 +1936,8 @@ static void ScalePlaneSimple_16(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr) {
  int i;
@ -1930,7 +1962,8 @@ static void ScalePlaneSimple_16(int src_width,
  }

  for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+              dx);
    dst_ptr += dst_stride;
    y += dy;
  }
@ -1948,14 +1981,6 @@ int ScalePlane(const uint8_t* src,
               int dst_width,
               int dst_height,
               enum FilterMode filtering) {
-  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
-  // This prevents FixedDiv signed integer overflows that can lead to division
-  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                filtering);
@ -1963,7 +1988,7 @@ int ScalePlane(const uint8_t* src,
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (int64_t)src_stride;
    src_stride = -src_stride;
  }
  // Use specialized scales to improve performance for common resolutions.
@ -2056,14 +2081,6 @@ int ScalePlane_16(const uint16_t* src,
                  int dst_width,
                  int dst_height,
                  enum FilterMode filtering) {
-  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
-  // This prevents FixedDiv signed integer overflows that can lead to division
-  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                filtering);
@ -2071,7 +2088,7 @@ int ScalePlane_16(const uint16_t* src,
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (int64_t)src_stride;
    src_stride = -src_stride;
  }
  // Use specialized scales to improve performance for common resolutions.
@ -2168,14 +2185,6 @@ int ScalePlane_12(const uint16_t* src,
                  int dst_width,
                  int dst_height,
                  enum FilterMode filtering) {
-  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
-  // This prevents FixedDiv signed integer overflows that can lead to division
-  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                filtering);
@ -2183,7 +2192,7 @@ int ScalePlane_12(const uint16_t* src,
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (int64_t)src_stride;
    src_stride = -src_stride;
  }

@ -2224,17 +2233,17 @@ int I420Scale(const uint8_t* src_y,
              int dst_width,
              int dst_height,
              enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }

  r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
                 dst_stride_y, dst_width, dst_height, filtering);
@ -2269,17 +2278,17 @@ int I420Scale_16(const uint16_t* src_y,
                 int dst_width,
                 int dst_height,
                 enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }

  r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
                    dst_stride_y, dst_width, dst_height, filtering);
@ -2314,17 +2323,17 @@ int I420Scale_12(const uint16_t* src_y,
                 int dst_width,
                 int dst_height,
                 enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }

  r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
                    dst_stride_y, dst_width, dst_height, filtering);
@ -2365,8 +2374,8 @@ int I444Scale(const uint8_t* src_y,
  int r;

  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
    return -1;
  }

@ -2406,8 +2415,8 @@ int I444Scale_16(const uint16_t* src_y,
  int r;

  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
    return -1;
  }

@ -2447,8 +2456,8 @@ int I444Scale_12(const uint16_t* src_y,
  int r;

  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
    return -1;
  }

@ -2488,15 +2497,15 @@ int I422Scale(const uint8_t* src_y,
              int dst_width,
              int dst_height,
              enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
  int r;

  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);

  r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
                 dst_stride_y, dst_width, dst_height, filtering);
@ -2531,15 +2540,15 @@ int I422Scale_16(const uint16_t* src_y,
                 int dst_width,
                 int dst_height,
                 enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
  int r;

  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);

  r = ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y,
                    dst_stride_y, dst_width, dst_height, filtering);
@ -2574,15 +2583,15 @@ int I422Scale_12(const uint16_t* src_y,
                 int dst_width,
                 int dst_height,
                 enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
  int r;

  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_u || !dst_v || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);

  r = ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y,
                    dst_stride_y, dst_width, dst_height, filtering);
@ -2616,17 +2625,17 @@ int NV12Scale(const uint8_t* src_y,
              int dst_width,
              int dst_height,
              enum FilterMode filtering) {
-  int r;
-
-  if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
-    return -1;
-  }
  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int r;
+
+  if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+      dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }

  r = ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y,
                 dst_stride_y, dst_width, dst_height, filtering);
@ -2655,8 +2664,8 @@ int NV24Scale(const uint8_t* src_y,
  int r;

  if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
-      src_height == INT_MIN || !dst_y || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+      dst_width <= 0 || dst_height <= 0) {
    return -1;
  }

--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@ -38,8 +38,8 @@ static void ScaleARGBDown2(int src_width,
                           int src_height,
                           int dst_width,
                           int dst_height,
-                           ptrdiff_t src_stride,
-                           ptrdiff_t dst_stride,
+                           int src_stride,
+                           int dst_stride,
                           const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int x,
@ -48,7 +48,7 @@ static void ScaleARGBDown2(int src_width,
                           int dy,
                           enum FilterMode filtering) {
  int j;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
                            uint8_t* dst_argb, int dst_width) =
      filtering == kFilterNone
@ -62,9 +62,9 @@ static void ScaleARGBDown2(int src_width,
  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
  // Advance to odd row, even column.
  if (filtering == kFilterBilinear) {
-    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+    src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
  } else {
-    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+    src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4;
  }

 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
@ -152,8 +152,8 @@ static int ScaleARGBDown4Box(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                             const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int x,
@ -169,12 +169,12 @@ static int ScaleARGBDown4Box(int src_width,
  align_buffer_64(row, row_size * 2);
  if (!row)
    return 1;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
                            uint8_t* dst_argb, int dst_width) =
      ScaleARGBRowDown2Box_C;
  // Advance to odd row, even column.
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
  (void)src_width;
  (void)src_height;
  (void)dx;
@ -226,8 +226,8 @@ static void ScaleARGBDownEven(int src_width,
                              int src_height,
                              int dst_width,
                              int dst_height,
-                              ptrdiff_t src_stride,
-                              ptrdiff_t dst_stride,
+                              int src_stride,
+                              int dst_stride,
                              const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              int x,
@ -237,7 +237,7 @@ static void ScaleARGBDownEven(int src_width,
                              enum FilterMode filtering) {
  int j;
  int col_step = dx >> 16;
-  ptrdiff_t row_stride = (dy >> 16) * src_stride;
+  ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
  void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
                               int src_step, uint8_t* dst_argb, int dst_width) =
      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
@ -245,7 +245,7 @@ static void ScaleARGBDownEven(int src_width,
  (void)src_height;
  assert(IS_ALIGNED(src_width, 2));
  assert(IS_ALIGNED(src_height, 2));
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
@ -302,8 +302,8 @@ static int ScaleARGBBilinearDown(int src_width,
                                 int src_height,
                                 int dst_width,
                                 int dst_height,
-                                 ptrdiff_t src_stride,
-                                 ptrdiff_t dst_stride,
+                                 int src_stride,
+                                 int dst_stride,
                                 const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int x,
@ -331,6 +331,14 @@ static int ScaleARGBBilinearDown(int src_width,
  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
  src_argb += xl * 4;
  x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
@ -405,7 +413,7 @@ static int ScaleARGBBilinearDown(int src_width,
    }
    for (j = 0; j < dst_height; ++j) {
      int yi = y >> 16;
-      const uint8_t* src = src_argb + yi * src_stride;
+      const uint8_t* src = src_argb + yi * (intptr_t)src_stride;
      if (filtering == kFilterLinear) {
        ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
      } else {
@ -429,8 +437,8 @@ static int ScaleARGBBilinearUp(int src_width,
                               int src_height,
                               int dst_width,
                               int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                               const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int x,
@ -446,6 +454,14 @@ static int ScaleARGBBilinearUp(int src_width,
                              int dst_width, int x, int dx) =
      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
  const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
@ -546,7 +562,7 @@ static int ScaleARGBBilinearUp(int src_width,

  {
    int yi = y >> 16;
-    const uint8_t* src = src_argb + yi * src_stride;
+    const uint8_t* src = src_argb + yi * (intptr_t)src_stride;

    // Allocate 2 rows of ARGB.
    const int row_size = (dst_width * 4 + 31) & ~31;
@ -555,7 +571,7 @@ static int ScaleARGBBilinearUp(int src_width,
      return 1;

    uint8_t* rowptr = row;
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
    int lasty = yi;

    ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
@ -573,7 +589,7 @@ static int ScaleARGBBilinearUp(int src_width,
        if (y > max_y) {
          y = max_y;
          yi = y >> 16;
-          src = src_argb + yi * src_stride;
+          src = src_argb + yi * (intptr_t)src_stride;
        }
        if (yi != lasty) {
          ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
@ -599,6 +615,283 @@ static int ScaleARGBBilinearUp(int src_width,
  return 0;
 }

+#ifdef YUVSCALEUP
+// Scale planar YUV up to ARGB with bilinear interpolation.
+// Each source row that is needed is converted to ARGB with I422ToARGBRow,
+// scaled horizontally into one of two row buffers, and the buffered rows are
+// then written to dst_argb through InterpolateRow.
+// Chroma rows are addressed at half the luma row index (kYShift).
+// x, dx, y and dy are 16.16 fixed point source positions and steps.
+// Returns 0 on success, or 1 if the temporary row buffer cannot be allocated.
+static int ScaleYUVToARGBBilinearUp(int src_width,
+                                    int src_height,
+                                    int dst_width,
+                                    int dst_height,
+                                    int src_stride_y,
+                                    int src_stride_u,
+                                    int src_stride_v,
+                                    int dst_stride_argb,
+                                    const uint8_t* src_y,
+                                    const uint8_t* src_u,
+                                    const uint8_t* src_v,
+                                    uint8_t* dst_argb,
+                                    int x,
+                                    int dx,
+                                    int y,
+                                    int dy,
+                                    enum FilterMode filtering) {
+  int j;
+  // Select the YUV to ARGB row converter, starting from the C version.
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
+      I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+      (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(src_width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_AVX512BW;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToARGBRow = I422ToARGBRow_SVE2;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    I422ToARGBRow = I422ToARGBRow_SME;
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    I422ToARGBRow = I422ToARGBRow_Any_LSX;
+    if (IS_ALIGNED(src_width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    I422ToARGBRow = I422ToARGBRow_Any_LASX;
+    if (IS_ALIGNED(src_width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToARGBRow = I422ToARGBRow_RVV;
+  }
+#endif
+
+  // Select the row function that writes the buffered rows to the destination.
+  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    InterpolateRow = InterpolateRow_Any_LSX;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
+
+  // Select the horizontal column scaler; filtering picks the filtered form.
+  void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
+      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+  // Sources of 32768 pixels or wider switch to the *64_C column scalers.
+  if (src_width >= 32768) {
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+  if (filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
+  if (filtering && TestCpuFlag(kCpuHasRVV)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_LSX)
+  if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_LSX;
+    }
+  }
+#endif
+  // Unfiltered exact 2x horizontal upsample has a dedicated column function.
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  // Clamp the starting position to the last source row.
+  const int max_y = (src_height - 1) << 16;
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;
+  int uv_yi = yi >> kYShift;
+  const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y;
+  const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+  const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
+
+  // Allocate 1 row of ARGB for source conversion and 2 rows of ARGB
+  // scaled horizontally to the destination width.
+  const int row_size = (dst_width * 4 + 31) & ~31;
+  align_buffer_64(row, row_size * 2 + src_width * 4);
+  // Check the allocation before deriving any pointer from it.
+  if (!row)
+    return 1;
+
+  uint8_t* argb_row = row + row_size * 2;
+  uint8_t* rowptr = row;
+  int rowstride = row_size;
+  int lasty = yi;
+
+  // Convert the first 2 rows of YUV to ARGB before scaling them; the column
+  // scalers take an ARGB source row, so the Y plane is not passed directly.
+  I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+  ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+  ScaleARGBFilterCols(rowptr + rowstride, argb_row, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        // Past the last row: clamp and recompute the row pointers.
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * (intptr_t)src_stride_y;
+        src_row_u = src_u + uv_yi * (intptr_t)src_stride_u;
+        src_row_v = src_v + uv_yi * (intptr_t)src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        // Swap which of the 2 buffered rows is treated as the current one.
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        // NOTE(review): the row just converted appears to be row yi + 1, so
+        // the chroma advance below presumably needs the parity of yi + 1, and
+        // nothing stops the row pointers at the last source row - confirm
+        // both before enabling YUVSCALEUP.
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      // Linear filtering: current row only, with a zero fraction.
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      // Top 8 bits of the fractional part of y select the blend weight.
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  return 0;
+}
+#endif
+
 // Scale ARGB to/from any dimensions, without interpolation.
 // Fixed point math is used for performance: The upper 16 bits
 // of x and dx is the integer part of the source position and
@ -608,8 +901,8 @@ static void ScaleARGBSimple(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                            const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int x,
@ -652,8 +945,8 @@ static void ScaleARGBSimple(int src_width,
  }

  for (j = 0; j < dst_height; ++j) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
-                  dx);
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride,
+                  dst_width, x, dx);
    dst_argb += dst_stride;
    y += dy;
  }
@ -688,7 +981,7 @@ static int ScaleARGB(const uint8_t* src,
  // Negative src_height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (intptr_t)src_stride;
    src_stride = -src_stride;
  }
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@ -703,8 +996,8 @@ static int ScaleARGB(const uint8_t* src,
  if (clip_y) {
    int64_t clipf = (int64_t)(clip_y)*dy;
    y += (clipf & 0xffff);
-    src += (clipf >> 16) * (ptrdiff_t)src_stride;
-    dst += clip_y * (ptrdiff_t)dst_stride;
+    src += (clipf >> 16) * (intptr_t)src_stride;
+    dst += clip_y * dst_stride;
  }

  // Special case for integer step values.
@ -737,7 +1030,7 @@ static int ScaleARGB(const uint8_t* src,
        filtering = kFilterNone;
        if (dx == 0x10000 && dy == 0x10000) {
          // Straight copy.
-          ARGBCopy(src + (y >> 16) * (ptrdiff_t)src_stride + (x >> 16) * 4,
+          ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4,
                   src_stride, dst, dst_stride, clip_width, clip_height);
          return 0;
        }
@ -779,9 +1072,9 @@ int ARGBScaleClip(const uint8_t* src_argb,
                  int clip_width,
                  int clip_height,
                  enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 || src_height == INT_MIN ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 || clip_x < 0 ||
-      clip_y < 0 || clip_width > 32768 || clip_height > 32768 ||
+  if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+      dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
+      clip_width > 32768 || clip_height > 32768 ||
      (clip_x + clip_width) > dst_width ||
      (clip_y + clip_height) > dst_height) {
    return -1;
@ -802,9 +1095,8 @@ int ARGBScale(const uint8_t* src_argb,
              int dst_width,
              int dst_height,
              enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 || src_height == INT_MIN ||
-      src_width > 32768 || src_height > 32768 || !dst_argb || dst_width <= 0 ||
-      dst_height <= 0) {
+  if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  return ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
@ -836,13 +1128,12 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
  int r;
  (void)src_fourcc;  // TODO(fbarchard): implement and/or assert.
  (void)dst_fourcc;
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
  if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 ||
-      src_width > INT_MAX / 4 || src_height == 0 || src_height == INT_MIN ||
-      dst_width <= 0 || dst_height <= 0 || clip_width <= 0 ||
-      clip_height <= 0) {
+      src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 ||
+      dst_height <= 0 || clip_width <= 0 || clip_height <= 0) {
    return -1;
  }
-  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
  const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4;
  if (argb_buffer_size > SIZE_MAX) {
    return -1;  // Invalid size.
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@ -792,10 +792,10 @@ void ScaleFilterCols64_C(uint8_t* dst_ptr,
 #undef BLENDER

 // Same as 8 bit arm blender but return is cast to uint16_t
-#define BLENDER(a, b, f)                                                      \
-  (uint16_t)((int)(a) +                                                       \
-             (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> \
-                   16))
+#define BLENDER(a, b, f) \
+  (uint16_t)(            \
+      (int)(a) +         \
+      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))

 void ScaleFilterCols_16_C(uint16_t* dst_ptr,
                          const uint16_t* src_ptr,
@ -1196,7 +1196,7 @@ void ScaleARGBColsUp2_C(uint8_t* dst_argb,

 // TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
 #define BLENDERC(a, b, f, s) \
  (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
 #define BLENDER(a, b, f)                                                 \
@ -1636,6 +1636,14 @@ void ScalePlaneVertical(int src_height,
  assert(dst_width > 0);
  assert(dst_height > 0);
  src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
@ -1710,6 +1718,14 @@ void ScalePlaneVertical_16(int src_height,
  assert(dst_width > 0);
  assert(dst_height > 0);
  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_16_Any_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_16_Any_SSSE3;
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@ -1759,25 +1759,25 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
-  asm volatile("pxor        %%xmm5,%%xmm5                 \n"
+      asm volatile("pxor        %%xmm5,%%xmm5                 \n"

               // 16 pixel loop.
               LABELALIGN
-               "1:          \n"
-               "movdqu      (%0),%%xmm3                   \n"
-               "lea         0x10(%0),%0                   \n"  // src_ptr += 16
-               "movdqu      (%1),%%xmm0                   \n"
-               "movdqu      0x10(%1),%%xmm1               \n"
-               "movdqa      %%xmm3,%%xmm2                 \n"
-               "punpcklbw   %%xmm5,%%xmm2                 \n"
-               "punpckhbw   %%xmm5,%%xmm3                 \n"
-               "paddusw     %%xmm2,%%xmm0                 \n"
-               "paddusw     %%xmm3,%%xmm1                 \n"
-               "movdqu      %%xmm0,(%1)                   \n"
-               "movdqu      %%xmm1,0x10(%1)               \n"
-               "lea         0x20(%1),%1                   \n"
-               "sub         $0x10,%2                      \n"
-               "jg          1b                            \n"
+      "1:          \n"
+      "movdqu      (%0),%%xmm3                   \n"
+      "lea         0x10(%0),%0                   \n"  // src_ptr += 16
+      "movdqu      (%1),%%xmm0                   \n"
+      "movdqu      0x10(%1),%%xmm1               \n"
+      "movdqa      %%xmm3,%%xmm2                 \n"
+      "punpcklbw   %%xmm5,%%xmm2                 \n"
+      "punpckhbw   %%xmm5,%%xmm3                 \n"
+      "paddusw     %%xmm2,%%xmm0                 \n"
+      "paddusw     %%xmm3,%%xmm1                 \n"
+      "movdqu      %%xmm0,(%1)                   \n"
+      "movdqu      %%xmm1,0x10(%1)               \n"
+      "lea         0x20(%1),%1                   \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(src_width)  // %2
@ -1790,23 +1790,23 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
-  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+      asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

               LABELALIGN
-               "1:          \n"
-               "vmovdqu     (%0),%%ymm3                   \n"
-               "lea         0x20(%0),%0                   \n"  // src_ptr += 32
-               "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
-               "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
-               "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
-               "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
-               "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
-               "vmovdqu     %%ymm0,(%1)                   \n"
-               "vmovdqu     %%ymm1,0x20(%1)               \n"
-               "lea         0x40(%1),%1                   \n"
-               "sub         $0x20,%2                      \n"
-               "jg          1b                            \n"
-               "vzeroupper  \n"
+      "1:          \n"
+      "vmovdqu     (%0),%%ymm3                   \n"
+      "lea         0x20(%0),%0                   \n"  // src_ptr += 32
+      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
+      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
+      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
+      "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
+      "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "vmovdqu     %%ymm1,0x20(%1)               \n"
+      "lea         0x40(%1),%1                   \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(src_width)  // %2
--- a/source/scale_rgb.cc
+++ b/source/scale_rgb.cc
@ -42,8 +42,8 @@ int RGBScale(const uint8_t* src_rgb,
             enum FilterMode filtering) {
  int r;
  if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
-      src_height == 0 || src_height == INT_MIN || dst_width <= 0 ||
-      dst_width > INT_MAX / 4 || dst_height <= 0) {
+      src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 ||
+      dst_height <= 0) {
    return -1;
  }
  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@ -11,7 +11,6 @@
 #include "libyuv/scale_uv.h"

 #include <assert.h>
-#include <limits.h>
 #include <string.h>

 #include "libyuv/cpu_id.h"
@ -60,8 +59,8 @@ static void ScaleUVDown2(int src_width,
                         int src_height,
                         int dst_width,
                         int dst_height,
-                         ptrdiff_t src_stride,
-                         ptrdiff_t dst_stride,
+                         int src_stride,
+                         int dst_stride,
                         const uint8_t* src_uv,
                         uint8_t* dst_uv,
                         int x,
@ -70,7 +69,7 @@ static void ScaleUVDown2(int src_width,
                         int dy,
                         enum FilterMode filtering) {
  int j;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
  void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
                          uint8_t* dst_uv, int dst_width) =
      filtering == kFilterNone
@ -84,9 +83,9 @@ static void ScaleUVDown2(int src_width,
  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
  // Advance to odd row, even column.
  if (filtering == kFilterBilinear) {
-    src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+    src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
  } else {
-    src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
+    src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2;
  }

 #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
@ -175,8 +174,8 @@ static int ScaleUVDown4Box(int src_width,
                           int src_height,
                           int dst_width,
                           int dst_height,
-                           ptrdiff_t src_stride,
-                           ptrdiff_t dst_stride,
+                           int src_stride,
+                           int dst_stride,
                           const uint8_t* src_uv,
                           uint8_t* dst_uv,
                           int x,
@ -189,12 +188,12 @@ static int ScaleUVDown4Box(int src_width,
  align_buffer_64(row, row_size * 2);
  if (!row)
    return 1;
-  ptrdiff_t row_stride = src_stride * (dy >> 16);
+  int row_stride = src_stride * (dy >> 16);
  void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
                          uint8_t* dst_uv, int dst_width) =
      ScaleUVRowDown2Box_C;
  // Advance to odd row, even column.
-  src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+  src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
  (void)src_width;
  (void)src_height;
  (void)dx;
@ -257,8 +256,8 @@ static void ScaleUVDownEven(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
-                            ptrdiff_t src_stride,
-                            ptrdiff_t dst_stride,
+                            int src_stride,
+                            int dst_stride,
                            const uint8_t* src_uv,
                            uint8_t* dst_uv,
                            int x,
@ -268,7 +267,7 @@ static void ScaleUVDownEven(int src_width,
                            enum FilterMode filtering) {
  int j;
  int col_step = dx >> 16;
-  ptrdiff_t row_stride = (dy >> 16) * src_stride;
+  ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride);
  void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
                             int src_step, uint8_t* dst_uv, int dst_width) =
      filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;
@ -276,7 +275,7 @@ static void ScaleUVDownEven(int src_width,
  (void)src_height;
  assert(IS_ALIGNED(src_width, 2));
  assert(IS_ALIGNED(src_height, 2));
-  src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
+  src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2;
 #if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
@ -335,8 +334,8 @@ static int ScaleUVBilinearDown(int src_width,
                               int src_height,
                               int dst_width,
                               int dst_height,
-                               ptrdiff_t src_stride,
-                               ptrdiff_t dst_stride,
+                               int src_stride,
+                               int dst_stride,
                               const uint8_t* src_uv,
                               uint8_t* dst_uv,
                               int x,
@ -364,6 +363,14 @@ static int ScaleUVBilinearDown(int src_width,
  clip_src_width = (int)(xr - xl) * 2;  // Width aligned to 2.
  src_uv += xl * 2;
  x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
@ -423,7 +430,7 @@ static int ScaleUVBilinearDown(int src_width,
    }
    for (j = 0; j < dst_height; ++j) {
      int yi = y >> 16;
-      const uint8_t* src = src_uv + yi * src_stride;
+      const uint8_t* src = src_uv + yi * (intptr_t)src_stride;
      if (filtering == kFilterLinear) {
        ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);
      } else {
@ -449,8 +456,8 @@ static int ScaleUVBilinearUp(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                             const uint8_t* src_uv,
                             uint8_t* dst_uv,
                             int x,
@ -466,6 +473,14 @@ static int ScaleUVBilinearUp(int src_width,
                            int dst_width, int x, int dx) =
      filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;
  const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
@ -544,7 +559,7 @@ static int ScaleUVBilinearUp(int src_width,

  {
    int yi = y >> 16;
-    const uint8_t* src = src_uv + yi * src_stride;
+    const uint8_t* src = src_uv + yi * (intptr_t)src_stride;

    // Allocate 2 rows of UV.
    const int row_size = (dst_width * 2 + 15) & ~15;
@ -553,7 +568,7 @@ static int ScaleUVBilinearUp(int src_width,
      return 1;

    uint8_t* rowptr = row;
-    ptrdiff_t rowstride = row_size;
+    int rowstride = row_size;
    int lasty = yi;

    ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
@ -571,7 +586,7 @@ static int ScaleUVBilinearUp(int src_width,
        if (y > max_y) {
          y = max_y;
          yi = y >> 16;
-          src = src_uv + yi * src_stride;
+          src = src_uv + yi * (intptr_t)src_stride;
        }
        if (yi != lasty) {
          ScaleUVFilterCols(rowptr, src, dst_width, x, dx);
@ -607,8 +622,8 @@ static void ScaleUVLinearUp2(int src_width,
                             int src_height,
                             int dst_width,
                             int dst_height,
-                             ptrdiff_t src_stride,
-                             ptrdiff_t dst_stride,
+                             int src_stride,
+                             int dst_stride,
                             const uint8_t* src_uv,
                             uint8_t* dst_uv) {
  void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
@ -646,12 +661,13 @@ static void ScaleUVLinearUp2(int src_width,
 #endif

  if (dst_height == 1) {
-    ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
+    ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+               dst_width);
  } else {
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
+      ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
      dst_uv += dst_stride;
      y += dy;
    }
@ -727,8 +743,8 @@ static void ScaleUVLinearUp2_16(int src_width,
                                int src_height,
                                int dst_width,
                                int dst_height,
-                                ptrdiff_t src_stride,
-                                ptrdiff_t dst_stride,
+                                int src_stride,
+                                int dst_stride,
                                const uint16_t* src_uv,
                                uint16_t* dst_uv) {
  void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
@ -760,12 +776,13 @@ static void ScaleUVLinearUp2_16(int src_width,
 #endif

  if (dst_height == 1) {
-    ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
+    ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv,
+               dst_width);
  } else {
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
-      ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
+      ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width);
      dst_uv += dst_stride;
      y += dy;
    }
@ -835,8 +852,8 @@ static void ScaleUVSimple(int src_width,
                          int src_height,
                          int dst_width,
                          int dst_height,
-                          ptrdiff_t src_stride,
-                          ptrdiff_t dst_stride,
+                          int src_stride,
+                          int dst_stride,
                          const uint8_t* src_uv,
                          uint8_t* dst_uv,
                          int x,
@ -871,7 +888,8 @@ static void ScaleUVSimple(int src_width,
  }

  for (j = 0; j < dst_height; ++j) {
-    ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx);
+    ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x,
+                dx);
    dst_uv += dst_stride;
    y += dy;
  }
@ -885,13 +903,13 @@ static int UVCopy(const uint8_t* src_uv,
                  int dst_stride_uv,
                  int width,
                  int height) {
-  if (!src_uv || !dst_uv || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_uv || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_uv = src_uv + (height - 1) * (ptrdiff_t)src_stride_uv;
+    src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }

@ -905,13 +923,13 @@ static int UVCopy_16(const uint16_t* src_uv,
                     int dst_stride_uv,
                     int width,
                     int height) {
-  if (!src_uv || !dst_uv || width <= 0 || height == 0 || height == INT_MIN) {
+  if (!src_uv || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    src_uv = src_uv + (height - 1) * (ptrdiff_t)src_stride_uv;
+    src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }

@ -949,7 +967,7 @@ static int ScaleUV(const uint8_t* src,
  // Negative src_height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
-    src = src + (src_height - 1) * (ptrdiff_t)src_stride;
+    src = src + (src_height - 1) * (intptr_t)src_stride;
    src_stride = -src_stride;
  }
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@ -964,8 +982,8 @@ static int ScaleUV(const uint8_t* src,
  if (clip_y) {
    int64_t clipf = (int64_t)(clip_y)*dy;
    y += (clipf & 0xffff);
-    src += (clipf >> 16) * (ptrdiff_t)src_stride;
-    dst += clip_y * (ptrdiff_t)dst_stride;
+    src += (clipf >> 16) * (intptr_t)src_stride;
+    dst += clip_y * dst_stride;
  }

  // Special case for integer step values.
@ -1005,8 +1023,9 @@ static int ScaleUV(const uint8_t* src,
 #ifdef HAS_UVCOPY
        if (dx == 0x10000 && dy == 0x10000) {
          // Straight copy.
-          return UVCopy(src + (y >> 16) * (ptrdiff_t)src_stride + (x >> 16) * 2,
-                        src_stride, dst, dst_stride, clip_width, clip_height);
+          UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2,
+                 src_stride, dst, dst_stride, clip_width, clip_height);
+          return 0;
        }
 #endif
      }
@ -1062,8 +1081,7 @@ int UVScale(const uint8_t* src_uv,
            int dst_height,
            enum FilterMode filtering) {
  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_height < -32768 || src_height > 32768 || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  return ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv,
@ -1085,9 +1103,8 @@ int UVScale_16(const uint16_t* src_uv,
               enum FilterMode filtering) {
  int dy = 0;

-  if (!src_uv || src_width <= 0 || src_height == 0 || src_height == INT_MIN ||
-      src_width > 32768 || src_height > 32768 || !dst_uv || dst_width <= 0 ||
-      dst_height <= 0) {
+  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }

@ -1099,7 +1116,7 @@ int UVScale_16(const uint16_t* src_uv,
  // Negative src_height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
-    src_uv = src_uv + (src_height - 1) * (ptrdiff_t)src_stride_uv;
+    src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }
  src_width = Abs(src_width);
@ -1107,17 +1124,16 @@ int UVScale_16(const uint16_t* src_uv,
 #ifdef HAS_UVCOPY
  if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
    if (dst_height == 1) {
-      return UVCopy_16(
-          src_uv + ((src_height - 1) / 2) * (ptrdiff_t)src_stride_uv,
-          src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height);
+      UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv,
+                src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height);
+    } else {
+      dy = src_height / dst_height;
+      UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv,
+                (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv,
+                dst_width, dst_height);
    }
-    dy = src_height / dst_height;
-    if (src_stride_uv > INT_MAX / dy) {
-      return -1;
-    }
-    return UVCopy_16(src_uv + ((dy - 1) / 2) * (ptrdiff_t)src_stride_uv,
-                     dy * src_stride_uv, dst_uv, dst_stride_uv, dst_width,
-                     dst_height);
+
+    return 0;
  }
 #endif

--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@ -104,7 +104,7 @@ __declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
-    psrlw      xmm0, 8       // isolate odd pixels.
+    psrlw      xmm0, 8          // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
@ -138,7 +138,7 @@ __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
-    pavgw      xmm0, xmm5    // (x + 1) / 2
+    pavgw      xmm0, xmm5       // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
@ -213,7 +213,7 @@ __declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
@ -249,7 +249,7 @@ __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
@ -319,7 +319,7 @@ __declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
-    pcmpeqb    xmm5, xmm5     // generate mask 0x00ff0000
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

@ -424,7 +424,7 @@ __declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
@ -687,7 +687,7 @@ __declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

-    movq       qword ptr [edx], xmm0    // write 12 pixels
+    movq       qword ptr [edx], xmm0       // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
@ -1030,7 +1030,7 @@ __declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
-    shufps     xmm2, xmm1, 0xdd    // odd pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
@ -1216,7 +1216,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
    test       ecx, 2
    je         xloop29

-         // 2 Pixels.
+        // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
@ -1229,7 +1229,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
    test       ecx, 1
    je         xloop99

-         // 1 Pixels.
+        // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:
--- a/unit_test/basictypes_test.cc
+++ b/unit_test/basictypes_test.cc
@ -22,22 +22,22 @@ TEST_F(LibYUVBaseTest, SizeOfTypes) {
  uint32_t u32 = 1u;
  int64_t i64 = -1;
  uint64_t u64 = 1u;
-  ASSERT_EQ(1u, sizeof(i8));
-  ASSERT_EQ(1u, sizeof(u8));
-  ASSERT_EQ(2u, sizeof(i16));
-  ASSERT_EQ(2u, sizeof(u16));
-  ASSERT_EQ(4u, sizeof(i32));
-  ASSERT_EQ(4u, sizeof(u32));
-  ASSERT_EQ(8u, sizeof(i64));
-  ASSERT_EQ(8u, sizeof(u64));
-  ASSERT_GT(0, i8);
-  ASSERT_LT(0u, u8);
-  ASSERT_GT(0, i16);
-  ASSERT_LT(0u, u16);
-  ASSERT_GT(0, i32);
-  ASSERT_LT(0u, u32);
-  ASSERT_GT(0, i64);
-  ASSERT_LT(0u, u64);
+  EXPECT_EQ(1u, sizeof(i8));
+  EXPECT_EQ(1u, sizeof(u8));
+  EXPECT_EQ(2u, sizeof(i16));
+  EXPECT_EQ(2u, sizeof(u16));
+  EXPECT_EQ(4u, sizeof(i32));
+  EXPECT_EQ(4u, sizeof(u32));
+  EXPECT_EQ(8u, sizeof(i64));
+  EXPECT_EQ(8u, sizeof(u64));
+  EXPECT_GT(0, i8);
+  EXPECT_LT(0u, u8);
+  EXPECT_GT(0, i16);
+  EXPECT_LT(0u, u16);
+  EXPECT_GT(0, i32);
+  EXPECT_LT(0u, u32);
+  EXPECT_GT(0, i64);
+  EXPECT_LT(0u, u64);
 }

 }  // namespace libyuv
--- a/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@ -22,8 +22,14 @@ namespace libyuv {

 // TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB.
 // Port to Visual C and other CPUs
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || defined(__i386__))
+#define ERROR_FULL 5
+#define ERROR_J420 4
+#else
 #define ERROR_FULL 6
 #define ERROR_J420 6
+#endif
 #define ERROR_R 1
 #define ERROR_G 1
 #ifdef LIBYUV_UNLIMITED_DATA
@ -113,11 +119,11 @@ namespace libyuv {
    }                                                                          \
    /* Test C and SIMD match. */                                               \
    for (int i = 0; i < kPixels * 4; ++i) {                                    \
-      ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                           \
+      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                           \
    }                                                                          \
    /* Test SIMD is close to original. */                                      \
    for (int i = 0; i < kPixels * 4; ++i) {                                    \
-      ASSERT_NEAR(static_cast<int>(orig_pixels[i]),                            \
+      EXPECT_NEAR(static_cast<int>(orig_pixels[i]),                            \
                  static_cast<int>(dst_pixels_opt[i]), DIFF);                  \
    }                                                                          \
                                                                               \
@ -425,16 +431,15 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
      allb |= b;
    }
  }
-  ASSERT_GE(allb, 0);
-  ASSERT_LE(allb, 255);
+  EXPECT_GE(allb, 0);
+  EXPECT_LE(allb, 255);
 }

 // BT.601 limited range YUV to RGB reference
 static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
-  double y1 = (y - 16) * 1.164;
-  *r = RoundToByte(y1 - (v - 128) * -1.596);
-  *g = RoundToByte(y1 - (u - 128) * 0.391 - (v - 128) * 0.813);
-  *b = RoundToByte(y1 - (u - 128) * -2.018);
+  *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
+  *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
+  *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
 }

 // BT.601 full range YUV to RGB reference (aka JPEG)
@ -447,10 +452,9 @@ static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
 // BT.709 limited range YUV to RGB reference
 // See also http://www.equasys.de/colorconversion.html
 static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
-  double y1 = (y - 16) * 1.164;
-  *r = RoundToByte(y1 - (v - 128) * -1.793);
-  *g = RoundToByte(y1 - (u - 128) * 0.213 - (v - 128) * 0.533);
-  *b = RoundToByte(y1 - (u - 128) * -2.112);
+  *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
+  *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.213 - (v - 128) * 0.533);
+  *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.112);
 }

 // BT.709 full range YUV to RGB reference
@ -462,10 +466,10 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {

 // BT.2020 limited range YUV to RGB reference
 static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
-  double y1 = (y - 16) * 1.164384;
-  *r = RoundToByte(y1 - (v - 128) * -1.67867);
-  *g = RoundToByte(y1 - (u - 128) * 0.187326 - (v - 128) * 0.65042);
-  *b = RoundToByte(y1 - (u - 128) * -2.14177);
+  *r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
+  *g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
+                   (v - 128) * 0.65042);
+  *b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
 }

 // BT.2020 full range YUV to RGB reference
@ -480,48 +484,48 @@ TEST_F(LibYUVColorTest, TestYUV) {

  // cyan (less red)
  YUVToRGBReference(240, 255, 0, &r0, &g0, &b0);
-  ASSERT_EQ(56, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(255, b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);

  YUVToRGB(240, 255, 0, &r1, &g1, &b1);
-  ASSERT_EQ(57, r1);
-  ASSERT_EQ(255, g1);
-  ASSERT_EQ(255, b1);
+  EXPECT_EQ(57, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);

  // green (less red and blue)
  YUVToRGBReference(240, 0, 0, &r0, &g0, &b0);
-  ASSERT_EQ(56, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(2, b0);
+  EXPECT_EQ(56, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(2, b0);

  YUVToRGB(240, 0, 0, &r1, &g1, &b1);
-  ASSERT_EQ(57, r1);
-  ASSERT_EQ(255, g1);
+  EXPECT_EQ(57, r1);
+  EXPECT_EQ(255, g1);
 #ifdef LIBYUV_UNLIMITED_DATA
-  ASSERT_EQ(3, b1);
+  EXPECT_EQ(3, b1);
 #else
-  ASSERT_EQ(5, b1);
+  EXPECT_EQ(5, b1);
 #endif

  for (int i = 0; i < 256; ++i) {
    YUVToRGBReference(i, 128, 128, &r0, &g0, &b0);
    YUVToRGB(i, 128, 128, &r1, &g1, &b1);
-    ASSERT_NEAR(r0, r1, ERROR_R);
-    ASSERT_NEAR(g0, g1, ERROR_G);
-    ASSERT_NEAR(b0, b1, ERROR_B);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);

    YUVToRGBReference(i, 0, 0, &r0, &g0, &b0);
    YUVToRGB(i, 0, 0, &r1, &g1, &b1);
-    ASSERT_NEAR(r0, r1, ERROR_R);
-    ASSERT_NEAR(g0, g1, ERROR_G);
-    ASSERT_NEAR(b0, b1, ERROR_B);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);

    YUVToRGBReference(i, 0, 255, &r0, &g0, &b0);
    YUVToRGB(i, 0, 255, &r1, &g1, &b1);
-    ASSERT_NEAR(r0, r1, ERROR_R);
-    ASSERT_NEAR(g0, g1, ERROR_G);
-    ASSERT_NEAR(b0, b1, ERROR_B);
+    EXPECT_NEAR(r0, r1, ERROR_R);
+    EXPECT_NEAR(g0, g1, ERROR_G);
+    EXPECT_NEAR(b0, b1, ERROR_B);
  }
 }

@ -530,47 +534,47 @@ TEST_F(LibYUVColorTest, TestGreyYUV) {

  // black
  YUVToRGBReference(16, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(0, r0);
-  ASSERT_EQ(0, g0);
-  ASSERT_EQ(0, b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);

  YUVToRGB(16, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(0, r1);
-  ASSERT_EQ(0, g1);
-  ASSERT_EQ(0, b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);

  // white
  YUVToRGBReference(240, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(255, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(255, b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);

  YUVToRGB(240, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(255, r1);
-  ASSERT_EQ(255, g1);
-  ASSERT_EQ(255, b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);

  // grey
  YUVToRGBReference(128, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(130, r0);
-  ASSERT_EQ(130, g0);
-  ASSERT_EQ(130, b0);
+  EXPECT_EQ(130, r0);
+  EXPECT_EQ(130, g0);
+  EXPECT_EQ(130, b0);

  YUVToRGB(128, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(130, r1);
-  ASSERT_EQ(130, g1);
-  ASSERT_EQ(130, b1);
+  EXPECT_EQ(130, r1);
+  EXPECT_EQ(130, g1);
+  EXPECT_EQ(130, b1);

  for (int y = 0; y < 256; ++y) {
    YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
    YUVToRGB(y, 128, 128, &r1, &g1, &b1);
    YToRGB(y, &r2, &g2, &b2);
-    ASSERT_EQ(r0, r1);
-    ASSERT_EQ(g0, g1);
-    ASSERT_EQ(b0, b1);
-    ASSERT_EQ(r0, r2);
-    ASSERT_EQ(g0, g2);
-    ASSERT_EQ(b0, b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
  }
 }

@ -608,11 +612,10 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
 #ifdef DISABLE_SLOW_TESTS
 #define FASTSTEP 5
 #else
-#define FASTSTEP 3
+#define FASTSTEP 1
 #endif

 // BT.601 limited range.
-#ifndef DISABLE_SLOW_TESTS
 TEST_F(LibYUVColorTest, TestFullYUV) {
  int rh[256] = {
      0,
@ -623,16 +626,16 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
  int bh[256] = {
      0,
  };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
        ++rh[r1 - r0 + 128];
        ++gh[g1 - g0 + 128];
        ++bh[b1 - b0 + 128];
@ -653,16 +656,16 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
  int bh[256] = {
      0,
  };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVJToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
        ++rh[r1 - r0 + 128];
        ++gh[g1 - g0 + 128];
        ++bh[b1 - b0 + 128];
@ -683,16 +686,16 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
  int bh[256] = {
      0,
  };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVHToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVHToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
        ++rh[r1 - r0 + 128];
        ++gh[g1 - g0 + 128];
        ++bh[b1 - b0 + 128];
@ -713,16 +716,16 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
  int bh[256] = {
      0,
  };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVFToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVFToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
        ++rh[r1 - r0 + 128];
        ++gh[g1 - g0 + 128];
        ++bh[b1 - b0 + 128];
@ -743,16 +746,16 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
  int bh[256] = {
      0,
  };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVUToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVUToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, ERROR_G);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, ERROR_G);
+        EXPECT_NEAR(b0, b1, ERROR_B);
        ++rh[r1 - r0 + 128];
        ++gh[g1 - g0 + 128];
        ++bh[b1 - b0 + 128];
@ -773,16 +776,16 @@ TEST_F(LibYUVColorTest, TestFullYUVV) {
  int bh[256] = {
      0,
  };
-  for (int u = 0; u < 256; u += FASTSTEP) {
-    for (int v = 0; v < 256; v += FASTSTEP) {
+  for (int u = 0; u < 256; ++u) {
+    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVVToRGB(y, u, v, &r1, &g1, &b1);
-        ASSERT_NEAR(r0, r1, ERROR_R);
-        ASSERT_NEAR(g0, g1, 2);
-        ASSERT_NEAR(b0, b1, ERROR_B);
+        EXPECT_NEAR(r0, r1, ERROR_R);
+        EXPECT_NEAR(g0, g1, 2);
+        EXPECT_NEAR(b0, b1, ERROR_B);
        ++rh[r1 - r0 + 128];
        ++gh[g1 - g0 + 128];
        ++bh[b1 - b0 + 128];
@ -791,8 +794,6 @@ TEST_F(LibYUVColorTest, TestFullYUVV) {
  }
  PrintHistogram(rh, gh, bh);
 }
-#endif  // DISABLE_SLOW_TESTS
-
 #undef FASTSTEP

 TEST_F(LibYUVColorTest, TestGreyYUVJ) {
@ -800,47 +801,47 @@ TEST_F(LibYUVColorTest, TestGreyYUVJ) {

  // black
  YUVJToRGBReference(0, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(0, r0);
-  ASSERT_EQ(0, g0);
-  ASSERT_EQ(0, b0);
+  EXPECT_EQ(0, r0);
+  EXPECT_EQ(0, g0);
+  EXPECT_EQ(0, b0);

  YUVJToRGB(0, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(0, r1);
-  ASSERT_EQ(0, g1);
-  ASSERT_EQ(0, b1);
+  EXPECT_EQ(0, r1);
+  EXPECT_EQ(0, g1);
+  EXPECT_EQ(0, b1);

  // white
  YUVJToRGBReference(255, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(255, r0);
-  ASSERT_EQ(255, g0);
-  ASSERT_EQ(255, b0);
+  EXPECT_EQ(255, r0);
+  EXPECT_EQ(255, g0);
+  EXPECT_EQ(255, b0);

  YUVJToRGB(255, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(255, r1);
-  ASSERT_EQ(255, g1);
-  ASSERT_EQ(255, b1);
+  EXPECT_EQ(255, r1);
+  EXPECT_EQ(255, g1);
+  EXPECT_EQ(255, b1);

  // grey
  YUVJToRGBReference(128, 128, 128, &r0, &g0, &b0);
-  ASSERT_EQ(128, r0);
-  ASSERT_EQ(128, g0);
-  ASSERT_EQ(128, b0);
+  EXPECT_EQ(128, r0);
+  EXPECT_EQ(128, g0);
+  EXPECT_EQ(128, b0);

  YUVJToRGB(128, 128, 128, &r1, &g1, &b1);
-  ASSERT_EQ(128, r1);
-  ASSERT_EQ(128, g1);
-  ASSERT_EQ(128, b1);
+  EXPECT_EQ(128, r1);
+  EXPECT_EQ(128, g1);
+  EXPECT_EQ(128, b1);

  for (int y = 0; y < 256; ++y) {
    YUVJToRGBReference(y, 128, 128, &r0, &g0, &b0);
    YUVJToRGB(y, 128, 128, &r1, &g1, &b1);
    YJToRGB(y, &r2, &g2, &b2);
-    ASSERT_EQ(r0, r1);
-    ASSERT_EQ(g0, g1);
-    ASSERT_EQ(b0, b1);
-    ASSERT_EQ(r0, r2);
-    ASSERT_EQ(g0, g2);
-    ASSERT_EQ(b0, b2);
+    EXPECT_EQ(r0, r1);
+    EXPECT_EQ(g0, g1);
+    EXPECT_EQ(b0, b1);
+    EXPECT_EQ(r0, r2);
+    EXPECT_EQ(g0, g2);
+    EXPECT_EQ(b0, b2);
  }
 }

--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@ -48,7 +48,7 @@ TEST_F(LibYUVCompareTest, Djb2_Test) {
      " together with Hermann Zapf";
  uint32_t foxhash = HashDjb2(reinterpret_cast<const uint8_t*>(fox), 131, 5381);
  const uint32_t kExpectedFoxHash = 2611006483u;
-  ASSERT_EQ(kExpectedFoxHash, foxhash);
+  EXPECT_EQ(kExpectedFoxHash, foxhash);

  for (int i = 0; i < kMaxTest; ++i) {
    src_a[i] = (fastrand() & 0xff);
@ -57,13 +57,13 @@ TEST_F(LibYUVCompareTest, Djb2_Test) {
  // Compare different buffers. Expect hash is different.
  uint32_t h1 = HashDjb2(src_a, kMaxTest, 5381);
  uint32_t h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make last half same. Expect hash is different.
  memcpy(src_a + kMaxTest / 2, src_b + kMaxTest / 2, kMaxTest / 2);
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make first half same. Expect hash is different.
  memcpy(src_a + kMaxTest / 2, src_a, kMaxTest / 2);
@ -71,52 +71,52 @@ TEST_F(LibYUVCompareTest, Djb2_Test) {
  memcpy(src_a, src_b, kMaxTest / 2);
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make same. Expect hash is same.
  memcpy(src_a, src_b, kMaxTest);
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);

  // Mask seed different. Expect hash is different.
  memcpy(src_a, src_b, kMaxTest);
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_b, kMaxTest, 1234);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make one byte different in middle. Expect hash is different.
  memcpy(src_a, src_b, kMaxTest);
  ++src_b[kMaxTest / 2];
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make first byte different. Expect hash is different.
  memcpy(src_a, src_b, kMaxTest);
  ++src_b[0];
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make last byte different. Expect hash is different.
  memcpy(src_a, src_b, kMaxTest);
  ++src_b[kMaxTest - 1];
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_b, kMaxTest, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make a zeros. Test different lengths. Expect hash is different.
  memset(src_a, 0, kMaxTest);
  h1 = HashDjb2(src_a, kMaxTest, 5381);
  h2 = HashDjb2(src_a, kMaxTest / 2, 5381);
-  ASSERT_NE(h1, h2);
+  EXPECT_NE(h1, h2);

  // Make a zeros and seed of zero. Test different lengths. Expect hash is same.
  memset(src_a, 0, kMaxTest);
  h1 = HashDjb2(src_a, kMaxTest, 0);
  h2 = HashDjb2(src_a, kMaxTest / 2, 0);
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -134,7 +134,7 @@ TEST_F(LibYUVCompareTest, BenchmarkDjb2_Opt) {
  for (int i = 0; i < benchmark_iterations_; ++i) {
    h1 = HashDjb2(src_a, kMaxTest, 5381);
  }
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);
  free_aligned_buffer_page_end(src_a);
 }

@ -149,7 +149,7 @@ TEST_F(LibYUVCompareTest, BenchmarkDjb2_Unaligned) {
  for (int i = 0; i < benchmark_iterations_; ++i) {
    h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
  }
-  ASSERT_EQ(h1, h2);
+  EXPECT_EQ(h1, h2);
  free_aligned_buffer_page_end(src_a);
 }

@ -164,19 +164,19 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Opt) {
  src_a[0] = 0;
  fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
                      benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
  src_a[0] = 255;
  src_a[3] = 0;
  fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
                      benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
  src_a[3] = 255;

  for (int i = 0; i < benchmark_iterations_; ++i) {
    fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
                        benchmark_height_);
  }
-  ASSERT_EQ(0u, fourcc);
+  EXPECT_EQ(0u, fourcc);

  free_aligned_buffer_page_end(src_a);
 }
@ -192,19 +192,19 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
  src_a[0 + 1] = 0;
  fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
                      benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
  src_a[0 + 1] = 255;
  src_a[3 + 1] = 0;
  fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
                      benchmark_height_);
-  ASSERT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
+  EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
  src_a[3 + 1] = 255;

  for (int i = 0; i < benchmark_iterations_; ++i) {
    fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
                        benchmark_height_);
  }
-  ASSERT_EQ(0u, fourcc);
+  EXPECT_EQ(0u, fourcc);

  free_aligned_buffer_page_end(src_a);
 }
@ -221,7 +221,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
  memcpy(src_a, "test0123test4567", 16);
  memcpy(src_b, "tick0123tock4567", 16);
  uint32_t h1 = HammingDistance_C(src_a, src_b, 16);
-  ASSERT_EQ(16u, h1);
+  EXPECT_EQ(16u, h1);

  // Test C vs OPT on random buffer
  MemRandomize(src_a, kMaxWidth);
@ -263,7 +263,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
    h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
 #endif
  }
-  ASSERT_EQ(h0, h1);
+  EXPECT_EQ(h0, h1);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -280,7 +280,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_C) {
  memcpy(src_a, "test0123test4567", 16);
  memcpy(src_b, "tick0123tock4567", 16);
  uint32_t h1 = HammingDistance_C(src_a, src_b, 16);
-  ASSERT_EQ(16u, h1);
+  EXPECT_EQ(16u, h1);

  // Test C vs OPT on random buffer
  MemRandomize(src_a, kMaxWidth);
@ -295,7 +295,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_C) {
    h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
  }

-  ASSERT_EQ(h0, h1);
+  EXPECT_EQ(h0, h1);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -311,7 +311,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) {
  memcpy(src_a, "test0123test4567", 16);
  memcpy(src_b, "tick0123tock4567", 16);
  uint64_t h1 = ComputeHammingDistance(src_a, src_b, 16);
-  ASSERT_EQ(16u, h1);
+  EXPECT_EQ(16u, h1);

  // Test C vs OPT on random buffer
  MemRandomize(src_a, kMaxWidth);
@ -326,7 +326,7 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) {
    h1 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
  }

-  ASSERT_EQ(h0, h1);
+  EXPECT_EQ(h0, h1);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -351,7 +351,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
  memset(src_b, 0u, kMaxWidth);

  uint64_t h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
-  ASSERT_EQ(kMaxWidth * 8ULL, h0);
+  EXPECT_EQ(kMaxWidth * 8ULL, h0);

  for (int i = 0; i < benchmark_iterations_; ++i) {
 #if defined(HAS_HAMMINGDISTANCE_NEON)
@ -389,7 +389,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
  // result can not be expected to be correct.
  // TODO(fbarchard): Consider expecting the low 16 bits to match.
  if (kMaxWidth <= kMaxOptCount) {
-    ASSERT_EQ(kMaxWidth * 8U, h1);
+    EXPECT_EQ(kMaxWidth * 8U, h1);
  } else {
    if (kMaxWidth * 8ULL != static_cast<uint64_t>(h1)) {
      printf(
@ -420,7 +420,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance) {
    h1 = ComputeHammingDistance(src_a, src_b,
                                benchmark_width_ * benchmark_height_);
  }
-  ASSERT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h1);
+  EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h1);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -436,7 +436,7 @@ TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) {
  memcpy(src_a, "test0123test4567", 16);
  memcpy(src_b, "tick0123tock4567", 16);
  uint64_t h1 = ComputeSumSquareError(src_a, src_b, 16);
-  ASSERT_EQ(790u, h1);
+  EXPECT_EQ(790u, h1);

  for (int i = 0; i < kMaxWidth; ++i) {
    src_a[i] = i;
@ -452,7 +452,7 @@ TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) {
    h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
  }

-  ASSERT_EQ(0u, h1);
+  EXPECT_EQ(0u, h1);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -468,18 +468,18 @@ TEST_F(LibYUVCompareTest, SumSquareError) {
  uint64_t err;
  err = ComputeSumSquareError(src_a, src_b, kMaxWidth);

-  ASSERT_EQ(0u, err);
+  EXPECT_EQ(0u, err);

  memset(src_a, 1, kMaxWidth);
  err = ComputeSumSquareError(src_a, src_b, kMaxWidth);

-  ASSERT_EQ(static_cast<int>(err), kMaxWidth);
+  EXPECT_EQ(static_cast<int>(err), kMaxWidth);

  memset(src_a, 190, kMaxWidth);
  memset(src_b, 193, kMaxWidth);
  err = ComputeSumSquareError(src_a, src_b, kMaxWidth);

-  ASSERT_EQ(static_cast<int>(err), kMaxWidth * 3 * 3);
+  EXPECT_EQ(static_cast<int>(err), kMaxWidth * 3 * 3);

  for (int i = 0; i < kMaxWidth; ++i) {
    src_a[i] = (fastrand() & 0xff);
@ -492,7 +492,7 @@ TEST_F(LibYUVCompareTest, SumSquareError) {
  MaskCpuFlags(benchmark_cpu_info_);
  uint64_t opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);

-  ASSERT_EQ(c_err, opt_err);
+  EXPECT_EQ(c_err, opt_err);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -517,7 +517,7 @@ TEST_F(LibYUVCompareTest, BenchmarkPsnr_Opt) {
  opt_time = (get_time() - opt_time) / benchmark_iterations_;
  printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);

-  ASSERT_EQ(0, 0);
+  EXPECT_EQ(0, 0);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -542,7 +542,7 @@ TEST_F(LibYUVCompareTest, BenchmarkPsnr_Unaligned) {
  opt_time = (get_time() - opt_time) / benchmark_iterations_;
  printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);

-  ASSERT_EQ(0, 0);
+  EXPECT_EQ(0, 0);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -564,7 +564,7 @@ TEST_F(LibYUVCompareTest, Psnr) {
                      src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                      kSrcHeight);

-  ASSERT_EQ(err, kMaxPsnr);
+  EXPECT_EQ(err, kMaxPsnr);

  memset(src_a, 255, kSrcPlaneSize);

@ -572,7 +572,7 @@ TEST_F(LibYUVCompareTest, Psnr) {
                      src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                      kSrcHeight);

-  ASSERT_EQ(err, 0.0);
+  EXPECT_EQ(err, 0.0);

  memset(src_a, 1, kSrcPlaneSize);

@ -580,8 +580,8 @@ TEST_F(LibYUVCompareTest, Psnr) {
                      src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                      kSrcHeight);

-  ASSERT_GT(err, 48.0);
-  ASSERT_LT(err, 49.0);
+  EXPECT_GT(err, 48.0);
+  EXPECT_LT(err, 49.0);

  for (int i = 0; i < kSrcPlaneSize; ++i) {
    src_a[i] = i;
@ -591,9 +591,9 @@ TEST_F(LibYUVCompareTest, Psnr) {
                      src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                      kSrcHeight);

-  ASSERT_GT(err, 2.0);
+  EXPECT_GT(err, 2.0);
  if (kSrcWidth * kSrcHeight >= 256) {
-    ASSERT_LT(err, 6.0);
+    EXPECT_LT(err, 6.0);
  }

  memset(src_a, 0, kSrcPlaneSize);
@ -619,7 +619,7 @@ TEST_F(LibYUVCompareTest, Psnr) {
                          src_b + kSrcStride * b + b, kSrcStride, kSrcWidth,
                          kSrcHeight);

-  ASSERT_EQ(opt_err, c_err);
+  EXPECT_EQ(opt_err, c_err);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -644,7 +644,7 @@ TEST_F(LibYUVCompareTest, DISABLED_BenchmarkSsim_Opt) {
  opt_time = (get_time() - opt_time) / benchmark_iterations_;
  printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6);

-  ASSERT_EQ(0, 0);  // Pass if we get this far.
+  EXPECT_EQ(0, 0);  // Pass if we get this far.

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
@ -671,7 +671,7 @@ TEST_F(LibYUVCompareTest, Ssim) {
                      kSrcHeight);

  if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_EQ(err, 1.0);
+    EXPECT_EQ(err, 1.0);
  }

  memset(src_a, 255, kSrcPlaneSize);
@ -681,7 +681,7 @@ TEST_F(LibYUVCompareTest, Ssim) {
                      kSrcHeight);

  if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_LT(err, 0.0001);
+    EXPECT_LT(err, 0.0001);
  }

  memset(src_a, 1, kSrcPlaneSize);
@ -691,8 +691,8 @@ TEST_F(LibYUVCompareTest, Ssim) {
                      kSrcHeight);

  if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_GT(err, 0.0001);
-    ASSERT_LT(err, 0.9);
+    EXPECT_GT(err, 0.0001);
+    EXPECT_LT(err, 0.9);
  }

  for (int i = 0; i < kSrcPlaneSize; ++i) {
@ -704,8 +704,8 @@ TEST_F(LibYUVCompareTest, Ssim) {
                      kSrcHeight);

  if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_GT(err, 0.0);
-    ASSERT_LT(err, 0.01);
+    EXPECT_GT(err, 0.0);
+    EXPECT_LT(err, 0.01);
  }

  for (int i = b; i < (kSrcHeight + b); ++i) {
@ -729,7 +729,7 @@ TEST_F(LibYUVCompareTest, Ssim) {
                          kSrcHeight);

  if (kSrcWidth > 8 && kSrcHeight > 8) {
-    ASSERT_EQ(opt_err, c_err);
+    EXPECT_EQ(opt_err, c_err);
  }

  free_aligned_buffer_page_end(src_a);
--- a/unit_test/convert_argb_test.cc
+++ b/unit_test/convert_argb_test.cc
@ -53,9 +53,9 @@ namespace libyuv {
 #define ABGRToABGR ARGBCopy

 // subsample amount uses a divide.
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))

-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))

 #define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X,              \
                   SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X,   \
@ -82,19 +82,15 @@ namespace libyuv {
        (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1);                    \
    const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X);    \
    const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y);  \
-    align_buffer_page_end(src_y,                                               \
-                          kPaddedWidth * kPaddedHeight * SRC_BPC + OFF);       \
+    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF);  \
    align_buffer_page_end(                                                     \
-        src_uv,                                                                \
-        kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC * 2 + OFF);       \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);                \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);              \
-    align_buffer_page_end(dst_u_opt,                                           \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
-    align_buffer_page_end(dst_v_opt,                                           \
-                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
+        src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                  \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);                \
+    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
+    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF);                    \
    SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF);                  \
    for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) {                   \
@ -105,12 +101,12 @@ namespace libyuv {
      src_uv_p[i] =                                                            \
          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH)));       \
    }                                                                          \
-    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                            \
-    memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
-    memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
-    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                        \
-    memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
-    memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
+    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                              \
+    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
+    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
+    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                          \
+    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
+    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    SRC_FMT_PLANAR##To##FMT_PLANAR(                                            \
        src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2,                          \
@ -128,11 +124,11 @@ namespace libyuv {
          NEG kHeight);                                                        \
    }                                                                          \
    for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) {                     \
-      ASSERT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
    }                                                                          \
    for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) {       \
-      ASSERT_EQ(dst_u_c[i], dst_u_opt[i]);                                     \
-      ASSERT_EQ(dst_v_c[i], dst_v_opt[i]);                                     \
+      EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);                                     \
+      EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);                                     \
    }                                                                          \
    free_aligned_buffer_page_end(dst_y_c);                                     \
    free_aligned_buffer_page_end(dst_u_c);                                     \
@ -227,11 +223,11 @@ TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                     \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
    align_buffer_page_end(src_u, kSizeUV + OFF);                              \
    align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);            \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);             \
    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
      src_y[i + OFF] = (fastrand() & 0xff);                                   \
    }                                                                         \
@ -258,7 +254,7 @@ TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
           static_cast<int>((time1 - time0) * 1e6),                           \
           static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_));  \
    for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                      \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                  \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                  \
    }                                                                         \
    free_aligned_buffer_page_end(src_y);                                      \
    free_aligned_buffer_page_end(src_u);                                      \
@ -385,58 +381,58 @@ TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
 TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
 #endif

-#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
-                   W1280, N, NEG, OFF)                                        \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                       \
-    const int kWidth = W1280;                                                 \
-    const int kHeight = benchmark_height_;                                    \
-    const int kStrideB = kWidth * BPP_B;                                      \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
-    align_buffer_page_end(                                                    \
-        src_uv, kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF);         \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                    \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                  \
-    for (int i = 0; i < kHeight; ++i)                                         \
-      for (int j = 0; j < kWidth; ++j)                                        \
-        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                    \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
-      for (int j = 0; j < kStrideUV * 2; ++j) {                               \
-        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);            \
-      }                                                                       \
-    }                                                                         \
-    memset(dst_argb_c, 1, kStrideB * kHeight);                                \
-    memset(dst_argb_opt, 101, kStrideB * kHeight);                            \
-    MaskCpuFlags(disable_cpu_flags_);                                         \
-    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,   \
-                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);   \
-    MaskCpuFlags(benchmark_cpu_info_);                                        \
-    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
-      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
-                            dst_argb_opt, kWidth * BPP_B, kWidth,             \
-                            NEG kHeight);                                     \
-    }                                                                         \
-    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */   \
-    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                \
-    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);              \
-    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                            \
-    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                        \
-    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,     \
-                  kHeight);                                                   \
-    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
-                  kHeight);                                                   \
-    for (int i = 0; i < kHeight; ++i) {                                       \
-      for (int j = 0; j < kWidth * 4; ++j) {                                  \
-        ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j],                           \
-                  dst_argb32_opt[i * kWidth * 4 + j]);                        \
-      }                                                                       \
-    }                                                                         \
-    free_aligned_buffer_page_end(src_y);                                      \
-    free_aligned_buffer_page_end(src_uv);                                     \
-    free_aligned_buffer_page_end(dst_argb_c);                                 \
-    free_aligned_buffer_page_end(dst_argb_opt);                               \
-    free_aligned_buffer_page_end(dst_argb32_c);                               \
-    free_aligned_buffer_page_end(dst_argb32_opt);                             \
+#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,      \
+                   W1280, N, NEG, OFF)                                         \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                        \
+    const int kWidth = W1280;                                                  \
+    const int kHeight = benchmark_height_;                                     \
+    const int kStrideB = kWidth * BPP_B;                                       \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_uv,                                              \
+                          kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight);                      \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight);                    \
+    for (int i = 0; i < kHeight; ++i)                                          \
+      for (int j = 0; j < kWidth; ++j)                                         \
+        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                     \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                  \
+      for (int j = 0; j < kStrideUV * 2; ++j) {                                \
+        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);             \
+      }                                                                        \
+    }                                                                          \
+    memset(dst_argb_c, 1, kStrideB* kHeight);                                  \
+    memset(dst_argb_opt, 101, kStrideB* kHeight);                              \
+    MaskCpuFlags(disable_cpu_flags_);                                          \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,    \
+                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);    \
+    MaskCpuFlags(benchmark_cpu_info_);                                         \
+    for (int i = 0; i < benchmark_iterations_; ++i) {                          \
+      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,  \
+                            dst_argb_opt, kWidth * BPP_B, kWidth,              \
+                            NEG kHeight);                                      \
+    }                                                                          \
+    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
+    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                 \
+    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);               \
+    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                             \
+    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                         \
+    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,      \
+                  kHeight);                                                    \
+    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth,  \
+                  kHeight);                                                    \
+    for (int i = 0; i < kHeight; ++i) {                                        \
+      for (int j = 0; j < kWidth * 4; ++j) {                                   \
+        EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j],                            \
+                  dst_argb32_opt[i * kWidth * 4 + j]);                         \
+      }                                                                        \
+    }                                                                          \
+    free_aligned_buffer_page_end(src_y);                                       \
+    free_aligned_buffer_page_end(src_uv);                                      \
+    free_aligned_buffer_page_end(dst_argb_c);                                  \
+    free_aligned_buffer_page_end(dst_argb_opt);                                \
+    free_aligned_buffer_page_end(dst_argb32_c);                                \
+    free_aligned_buffer_page_end(dst_argb32_opt);                              \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -511,16 +507,15 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
    const int kStrideB =                                                       \
        (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                 \
    align_buffer_page_end(src_argb,                                            \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_argb_c,                                          \
-                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
    align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
+                          kStrideB* kHeightB*(int)sizeof(TYPE_B));             \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
      src_argb[i + OFF] = (fastrand() & 0xff);                                 \
    }                                                                          \
-    memset(dst_argb_c, 1, kStrideB * kHeightB);                                \
-    memset(dst_argb_opt, 101, kStrideB * kHeightB);                            \
+    memset(dst_argb_c, 1, kStrideB* kHeightB);                                 \
+    memset(dst_argb_opt, 101, kStrideB* kHeightB);                             \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \
                     kStrideB, kWidth, NEG kHeight);                           \
@ -530,49 +525,48 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
                       (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight);  \
    }                                                                          \
    for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) {      \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
    }                                                                          \
    free_aligned_buffer_page_end(src_argb);                                    \
    free_aligned_buffer_page_end(dst_argb_c);                                  \
    free_aligned_buffer_page_end(dst_argb_opt);                                \
  }

-#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,     \
-                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                   \
-  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                    \
-    for (int times = 0; times < benchmark_iterations_; ++times) {           \
-      const int kWidth = (fastrand() & 63) + 1;                             \
-      const int kHeight = (fastrand() & 31) + 1;                            \
-      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;  \
-      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;  \
-      const int kStrideA =                                                  \
-          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;            \
-      const int kStrideB =                                                  \
-          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;            \
-      align_buffer_page_end(src_argb,                                       \
-                            kStrideA * kHeightA * (int)sizeof(TYPE_A));     \
-      align_buffer_page_end(dst_argb_c,                                     \
-                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
-      align_buffer_page_end(dst_argb_opt,                                   \
-                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
-      for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
-        src_argb[i] = 0xfe;                                                 \
-      }                                                                     \
-      memset(dst_argb_c, 123, kStrideB * kHeightB);                         \
-      memset(dst_argb_opt, 123, kStrideB * kHeightB);                       \
-      MaskCpuFlags(disable_cpu_flags_);                                     \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,    \
-                       kStrideB, kWidth, kHeight);                          \
-      MaskCpuFlags(benchmark_cpu_info_);                                    \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,  \
-                       kStrideB, kWidth, kHeight);                          \
-      for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
-        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                          \
-      }                                                                     \
-      free_aligned_buffer_page_end(src_argb);                               \
-      free_aligned_buffer_page_end(dst_argb_c);                             \
-      free_aligned_buffer_page_end(dst_argb_opt);                           \
-    }                                                                       \
+// Converts a 0xfe-filled image of random size (width 1..64, height 1..32)
+// under both CPU-flag masks and expects byte-identical destination buffers.
+#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,        \
+                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                      \
+  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                       \
+    for (int times = 0; times < benchmark_iterations_; ++times) {              \
+      const int kWidth = (fastrand() & 63) + 1;                                \
+      const int kHeight = (fastrand() & 31) + 1;                               \
+      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;     \
+      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;     \
+      const int kStrideA =                                                     \
+          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
+      const int kStrideB =                                                     \
+          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
+      const int kSizeA = kStrideA * kHeightA * (int)sizeof(TYPE_A);            \
+      const int kSizeB = kStrideB * kHeightB * (int)sizeof(TYPE_B);            \
+      align_buffer_page_end(src_argb, kSizeA);                                 \
+      align_buffer_page_end(dst_argb_c, kSizeB);                               \
+      align_buffer_page_end(dst_argb_opt, kSizeB);                             \
+      memset(src_argb, 0xfe, kSizeA);                                          \
+      memset(dst_argb_c, 123, kSizeB);                                         \
+      memset(dst_argb_opt, 123, kSizeB);                                       \
+      MaskCpuFlags(disable_cpu_flags_);                                        \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,       \
+                       kStrideB, kWidth, kHeight);                             \
+      MaskCpuFlags(benchmark_cpu_info_);                                       \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,     \
+                       kStrideB, kWidth, kHeight);                             \
+      for (int i = 0; i < kSizeB; ++i) {                                       \
+        EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
+      }                                                                        \
+      free_aligned_buffer_page_end(src_argb);                                  \
+      free_aligned_buffer_page_end(dst_argb_c);                                \
+      free_aligned_buffer_page_end(dst_argb_opt);                              \
+    }                                                                          \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -678,11 +672,11 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
    const int kStrideB =                                                      \
        (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                \
    align_buffer_page_end(src_argb,                                           \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
    align_buffer_page_end(dst_argb_c,                                         \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
    align_buffer_page_end(dst_argb_opt,                                       \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {     \
      src_argb[i + OFF] = (fastrand() & 0xff);                                \
    }                                                                         \
@ -703,7 +697,7 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
    FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA,       \
                     (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight);   \
    for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) {     \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                              \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                              \
    }                                                                         \
    free_aligned_buffer_page_end(src_argb);                                   \
    free_aligned_buffer_page_end(dst_argb_c);                                 \
@ -797,14 +791,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
        (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
    const int kStrideB =                                                     \
        (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-    align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF);              \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
+    align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
    for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
      src_argb[i + OFF] = (fastrand() & 0xff);                               \
    }                                                                        \
-    memset(dst_argb_c, 1, kStrideB * kHeightB);                              \
-    memset(dst_argb_opt, 101, kStrideB * kHeightB);                          \
+    memset(dst_argb_c, 1, kStrideB* kHeightB);                               \
+    memset(dst_argb_opt, 101, kStrideB* kHeightB);                           \
    MaskCpuFlags(disable_cpu_flags_);                                        \
    FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
                             NULL, kWidth, NEG kHeight);                     \
@ -814,7 +808,7 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
                               kStrideB, NULL, kWidth, NEG kHeight);         \
    }                                                                        \
    for (int i = 0; i < kStrideB * kHeightB; ++i) {                          \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
    }                                                                        \
    free_aligned_buffer_page_end(src_argb);                                  \
    free_aligned_buffer_page_end(dst_argb_c);                                \
@ -833,14 +827,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
          (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
      const int kStrideB =                                                     \
          (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-      align_buffer_page_end(src_argb, kStrideA * kHeightA);                    \
-      align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
-      align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
+      align_buffer_page_end(src_argb, kStrideA* kHeightA);                     \
+      align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
+      align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
      for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
        src_argb[i] = (fastrand() & 0xff);                                     \
      }                                                                        \
-      memset(dst_argb_c, 123, kStrideB * kHeightB);                            \
-      memset(dst_argb_opt, 123, kStrideB * kHeightB);                          \
+      memset(dst_argb_c, 123, kStrideB* kHeightB);                             \
+      memset(dst_argb_opt, 123, kStrideB* kHeightB);                           \
      MaskCpuFlags(disable_cpu_flags_);                                        \
      FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
                               kWidth, kHeight);                               \
@ -848,7 +842,7 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
      FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB,     \
                               NULL, kWidth, kHeight);                         \
      for (int i = 0; i < kStrideB * kHeightB; ++i) {                          \
-        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
+        EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
      }                                                                        \
      free_aligned_buffer_page_end(src_argb);                                  \
      free_aligned_buffer_page_end(dst_argb_c);                                \
@ -891,16 +885,15 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
    const int kStrideA =                                                       \
        (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;                 \
    align_buffer_page_end(src_argb,                                            \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_argb_c,                                          \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+    align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
    align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A));             \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
      src_argb[i + OFF] = (fastrand() & 0xff);                                 \
    }                                                                          \
-    memset(dst_argb_c, 1, kStrideA * kHeightA);                                \
-    memset(dst_argb_opt, 101, kStrideA * kHeightA);                            \
+    memset(dst_argb_c, 1, kStrideA* kHeightA);                                 \
+    memset(dst_argb_opt, 101, kStrideA* kHeightA);                             \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c,         \
             kStrideA, kWidth, NEG kHeight);                                   \
@ -916,8 +909,8 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
    FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \
             kWidth, NEG kHeight);                                             \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
-      ASSERT_EQ(src_argb[i + OFF], dst_argb_opt[i]);                           \
-      ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
+      EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]);                           \
+      EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);                               \
    }                                                                          \
    free_aligned_buffer_page_end(src_argb);                                    \
    free_aligned_buffer_page_end(dst_argb_c);                                  \
@ -952,12 +945,12 @@ TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
    align_buffer_page_end(src_u, kSizeUV + OFF);                               \
    align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
+    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      src_y[i + OFF] = (fastrand() & 0xff);                                    \
      src_a[i + OFF] = (fastrand() & 0xff);                                    \
@ -981,7 +974,7 @@ TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
                            ATTEN);                                            \
    }                                                                          \
    for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
    }                                                                          \
    free_aligned_buffer_page_end(src_y);                                       \
    free_aligned_buffer_page_end(src_u);                                       \
@ -1171,7 +1164,7 @@ TEST_F(LibYUVConvertTest, TestYToARGB) {
           argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
  }
  for (int i = 0; i < 32; ++i) {
-    ASSERT_EQ(expectedg[i], argb[i * 4 + 0]);
+    EXPECT_EQ(expectedg[i], argb[i * 4 + 0]);
  }
 }

@ -1193,7 +1186,7 @@ TEST_F(LibYUVConvertTest, TestNoDither) {
                     benchmark_width_ * 2, kNoDither4x4, benchmark_width_,
                     benchmark_height_);
  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
-    ASSERT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
+    EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]);
  }

  free_aligned_buffer_page_end(src_argb);
@ -1230,7 +1223,7 @@ TEST_F(LibYUVConvertTest, TestDither) {
               benchmark_width_ * 4, benchmark_width_, benchmark_height_);

  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
-    ASSERT_NEAR(dst_argb[i], dst_argbdither[i], 9);
+    EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
  }
  free_aligned_buffer_page_end(src_argb);
  free_aligned_buffer_page_end(dst_rgb565);
@ -1247,11 +1240,11 @@ TEST_F(LibYUVConvertTest, TestDither) {
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
    align_buffer_page_end(src_u, kSizeUV + OFF);                               \
    align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      src_y[i + OFF] = (fastrand() & 0xff);                                    \
    }                                                                          \
@ -1272,16 +1265,16 @@ TEST_F(LibYUVConvertTest, TestDither) {
          dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight);            \
    }                                                                          \
    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
-    align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight);             \
-    align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight);           \
-    memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight);                         \
-    memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight);                     \
+    align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight);               \
+    align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight);             \
+    memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight);                           \
+    memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight);                       \
    FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
                     kWidth, kHeight);                                         \
    FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt,             \
                     kWidth * BPP_C, kWidth, kHeight);                         \
    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb32_c[i], dst_argb32_opt[i]);                           \
+      EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]);                           \
    }                                                                          \
    free_aligned_buffer_page_end(src_y);                                       \
    free_aligned_buffer_page_end(src_u);                                       \
@ -1324,10 +1317,10 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                    \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
    align_buffer_page_end(src_u, kSizeUV + OFF);                              \
    align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);               \
    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
      src_y[i + OFF] = (fastrand() & 0xff);                                   \
    }                                                                         \
@ -1341,8 +1334,8 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
                          kWidth, NEG kHeight);                               \
    /* Convert to a 3rd format in 1 step and 2 steps and compare  */          \
    const int kStrideC = kWidth * BPP_C;                                      \
-    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);             \
+    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);              \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                          \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                         \
    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
@ -1354,7 +1347,7 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
                       kStrideC, kWidth, kHeight);                            \
    }                                                                         \
    for (int i = 0; i < kStrideC * kHeight; ++i) {                            \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                   \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                   \
    }                                                                         \
    free_aligned_buffer_page_end(src_y);                                      \
    free_aligned_buffer_page_end(src_u);                                      \
@ -1471,14 +1464,14 @@ TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
    const int kSizeUV =                                                        \
        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y);          \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
    align_buffer_page_end(src_u, kSizeUV + OFF);                               \
    align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
-    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
+    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
    const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
    memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
@ -1506,7 +1499,7 @@ TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
        src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth,        \
        dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN);               \
    for (int i = 0; i < kStrideC * kHeight; ++i) {                             \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                    \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]);                    \
    }                                                                          \
    free_aligned_buffer_page_end(src_y);                                       \
    free_aligned_buffer_page_end(src_u);                                       \
@ -1585,16 +1578,16 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
    const int kHeight = benchmark_height_;                                     \
    const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                     \
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
-    align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
+    align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
    MemRandomize(src_argb_a + OFF, kStrideA * kHeight);                        \
    memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
    FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB,   \
                     kWidth, NEG kHeight);                                     \
    /* Convert to a 3rd format in 1 step and 2 steps and compare  */           \
    const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
    for (int i = 0; i < benchmark_iterations_; ++i) {                          \
@ -1605,10 +1598,10 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
                       kStrideC, kWidth, kHeight);                             \
    }                                                                          \
    for (int i = 0; i < kStrideC * kHeight; i += 4) {                          \
-      ASSERT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]);            \
-      ASSERT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]);            \
-      ASSERT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]);            \
-      ASSERT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64);      \
+      EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]);            \
+      EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]);            \
+      EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]);            \
+      EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64);      \
    }                                                                          \
    free_aligned_buffer_page_end(src_argb_a);                                  \
    free_aligned_buffer_page_end(dst_argb_b);                                  \
@ -1671,12 +1664,12 @@ TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
                        2,  // crop height
                        kRotate90, FOURCC_ARGB);

-  ASSERT_EQ(r, 0);
+  EXPECT_EQ(r, 0);
  // 90 degrees rotation, no conversion
-  ASSERT_EQ(dst[0], src[2]);
-  ASSERT_EQ(dst[1], src[0]);
-  ASSERT_EQ(dst[2], src[3]);
-  ASSERT_EQ(dst[3], src[1]);
+  EXPECT_EQ(dst[0], src[2]);
+  EXPECT_EQ(dst[1], src[0]);
+  EXPECT_EQ(dst[2], src[3]);
+  EXPECT_EQ(dst[3], src[1]);
 }

 #ifdef HAS_ARGBTOAR30ROW_AVX2
@ -1704,7 +1697,7 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
    }
  }
  for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_opt[i], dst_c[i]);
+    EXPECT_EQ(dst_opt[i], dst_c[i]);
  }

  free_aligned_buffer_page_end(src);
@ -1738,7 +1731,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
    }
  }
  for (int i = 0; i < kPixels * 4; ++i) {
-    ASSERT_EQ(dst_opt[i], dst_c[i]);
+    EXPECT_EQ(dst_opt[i], dst_c[i]);
  }

  free_aligned_buffer_page_end(src);
@ -1805,11 +1798,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
    const int kBpc = 2;                                                       \
-    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);             \
-    align_buffer_page_end(src_u, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(src_v, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);             \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);           \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);               \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);            \
    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
      reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
    }                                                                         \
@ -1834,7 +1827,7 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
          dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight);                \
    }                                                                         \
    for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                      \
-      ASSERT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                \
+      EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                \
    }                                                                         \
    free_aligned_buffer_page_end(src_y);                                      \
    free_aligned_buffer_page_end(src_u);                                      \
@ -1920,12 +1913,12 @@ TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
    const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + OFF);               \
-    align_buffer_page_end(src_u, kSizeUV * kBpc + OFF);                        \
-    align_buffer_page_end(src_v, kSizeUV * kBpc + OFF);                        \
-    align_buffer_page_end(src_a, kWidth * kHeight * kBpc + OFF);               \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF);                 \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF);                         \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF);                         \
+    align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF);                 \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      reinterpret_cast<uint16_t*>(src_y + OFF)[i] =                            \
          (fastrand() & ((1 << S_DEPTH) - 1));                                 \
@ -1957,7 +1950,7 @@ TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
          dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN);           \
    }                                                                          \
    for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
+      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]);                   \
    }                                                                          \
    free_aligned_buffer_page_end(src_y);                                       \
    free_aligned_buffer_page_end(src_u);                                       \
@ -2153,10 +2146,10 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;         \
    const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);              \
-    align_buffer_page_end(src_uv, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);              \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);            \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);                \
+    align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);               \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);             \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      reinterpret_cast<uint16_t*>(src_y + SOFF)[i] =                           \
          (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH)));                 \
@ -2180,7 +2173,7 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
                            NEG kHeight);                                      \
    }                                                                          \
    for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) {                       \
-      ASSERT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                 \
+      EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]);                 \
    }                                                                          \
    free_aligned_buffer_page_end(src_y);                                       \
    free_aligned_buffer_page_end(src_uv);                                      \
@ -2323,10 +2316,10 @@ TEST_F(LibYUVConvertTest, TestH420ToARGB) {
    ++histogram_r[r];
    // Reference formula for Y channel contribution in YUV to RGB conversions:
    int expected_y = Clamp(static_cast<int>((i - 16) * 1.164f + 0.5f));
-    ASSERT_EQ(b, expected_y);
-    ASSERT_EQ(g, expected_y);
-    ASSERT_EQ(r, expected_y);
-    ASSERT_EQ(a, 255);
+    EXPECT_EQ(b, expected_y);
+    EXPECT_EQ(g, expected_y);
+    EXPECT_EQ(r, expected_y);
+    EXPECT_EQ(a, 255);
  }

  int count_b = 0;
@ -2384,10 +2377,10 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) {
    ++histogram_g[g];
    ++histogram_r[r];
    int expected_y = Clamp(static_cast<int>((i - 64) * 1.164f / 4));
-    ASSERT_NEAR(b, expected_y, 1);
-    ASSERT_NEAR(g, expected_y, 1);
-    ASSERT_NEAR(r, expected_y, 1);
-    ASSERT_EQ(a, 255);
+    EXPECT_NEAR(b, expected_y, 1);
+    EXPECT_NEAR(g, expected_y, 1);
+    EXPECT_NEAR(r, expected_y, 1);
+    EXPECT_EQ(a, 255);
  }

  int count_b = 0;
@ -2448,10 +2441,10 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) {
    ++histogram_g[g10];
    ++histogram_r[r10];
    int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f + 0.5));
-    ASSERT_NEAR(b10, expected_y, 4);
-    ASSERT_NEAR(g10, expected_y, 4);
-    ASSERT_NEAR(r10, expected_y, 4);
-    ASSERT_EQ(a2, 3);
+    EXPECT_NEAR(b10, expected_y, 4);
+    EXPECT_NEAR(g10, expected_y, 4);
+    EXPECT_NEAR(r10, expected_y, 4);
+    EXPECT_EQ(a2, 3);
  }

  int count_b = 0;
@ -2512,10 +2505,10 @@ TEST_F(LibYUVConvertTest, TestH010ToAB30) {
    ++histogram_g[g10];
    ++histogram_r[r10];
    int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
-    ASSERT_NEAR(b10, expected_y, 4);
-    ASSERT_NEAR(g10, expected_y, 4);
-    ASSERT_NEAR(r10, expected_y, 4);
-    ASSERT_EQ(a2, 3);
+    EXPECT_NEAR(b10, expected_y, 4);
+    EXPECT_NEAR(g10, expected_y, 4);
+    EXPECT_NEAR(r10, expected_y, 4);
+    EXPECT_EQ(a2, 3);
  }

  int count_b = 0;
@ -2574,10 +2567,10 @@ TEST_F(LibYUVConvertTest, TestH420ToAR30) {
    ++histogram_g[g10];
    ++histogram_r[r10];
    int expected_y = Clamp10(static_cast<int>((i - 16) * 1.164f * 4.f));
-    ASSERT_NEAR(b10, expected_y, 4);
-    ASSERT_NEAR(g10, expected_y, 4);
-    ASSERT_NEAR(r10, expected_y, 4);
-    ASSERT_EQ(a2, 3);
+    EXPECT_NEAR(b10, expected_y, 4);
+    EXPECT_NEAR(g10, expected_y, 4);
+    EXPECT_NEAR(r10, expected_y, 4);
+    EXPECT_EQ(a2, 3);
  }

  int count_b = 0;
@ -2624,34 +2617,34 @@ TEST_F(LibYUVConvertTest, TestI400) {
  I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
                   kSize, 1);

-  ASSERT_EQ(0, argb_pixels_i400[0]);
-  ASSERT_EQ(0, argb_pixels_j400[0]);
-  ASSERT_EQ(0, argb_pixels_jpeg_i400[0]);
-  ASSERT_EQ(0, argb_pixels_h709_i400[0]);
-  ASSERT_EQ(0, argb_pixels_2020_i400[0]);
-  ASSERT_EQ(0, argb_pixels_i400[16 * 4]);
-  ASSERT_EQ(16, argb_pixels_j400[16 * 4]);
-  ASSERT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
-  ASSERT_EQ(0, argb_pixels_h709_i400[16 * 4]);
-  ASSERT_EQ(0, argb_pixels_2020_i400[16 * 4]);
-  ASSERT_EQ(130, argb_pixels_i400[128 * 4]);
-  ASSERT_EQ(128, argb_pixels_j400[128 * 4]);
-  ASSERT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
-  ASSERT_EQ(130, argb_pixels_h709_i400[128 * 4]);
-  ASSERT_EQ(130, argb_pixels_2020_i400[128 * 4]);
-  ASSERT_EQ(255, argb_pixels_i400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_j400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_h709_i400[255 * 4]);
-  ASSERT_EQ(255, argb_pixels_2020_i400[255 * 4]);
+  EXPECT_EQ(0, argb_pixels_i400[0]);
+  EXPECT_EQ(0, argb_pixels_j400[0]);
+  EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
+  EXPECT_EQ(0, argb_pixels_h709_i400[0]);
+  EXPECT_EQ(0, argb_pixels_2020_i400[0]);
+  EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
+  EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
+  EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
+  EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
+  EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
+  EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
+  EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
+  EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
+  EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
+  EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
+  EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
+  EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);

  for (int i = 0; i < kSize * 4; ++i) {
    if ((i & 3) == 3) {
-      ASSERT_EQ(255, argb_pixels_j400[i]);
+      EXPECT_EQ(255, argb_pixels_j400[i]);
    } else {
-      ASSERT_EQ(i / 4, argb_pixels_j400[i]);
+      EXPECT_EQ(i / 4, argb_pixels_j400[i]);
    }
-    ASSERT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
+    EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
  }

  free_aligned_buffer_page_end(orig_i400);
@ -2678,7 +2671,7 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
  ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);

  for (int i = 0; i < kSize * 3; ++i) {
-    ASSERT_EQ(orig_rgb24[i], dest_rgb24[i]);
+    EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
  }

  free_aligned_buffer_page_end(orig_rgb24);
@ -2697,7 +2690,7 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB565) {
  }
  ARGBToRGB565(&orig_pixels[0][0], 0, &dest_rgb565[0][0], 0, 256, 1);
  uint32_t checksum = HashDjb2(&dest_rgb565[0][0], sizeof(dest_rgb565), 5381);
-  ASSERT_EQ(610919429u, checksum);
+  EXPECT_EQ(610919429u, checksum);
 }

 TEST_F(LibYUVConvertTest, TestYUY2ToARGB) {
@ -2712,9 +2705,9 @@ TEST_F(LibYUVConvertTest, TestYUY2ToARGB) {
  YUY2ToARGB(&orig_pixels[0][0], 0, &dest_argb[0][0], 0, 256, 1);
  uint32_t checksum = HashDjb2(&dest_argb[0][0], sizeof(dest_argb), 5381);
 #if defined(LIBYUV_UNLIMITED_DATA)
-  ASSERT_EQ(10343289u, checksum);
+  EXPECT_EQ(10343289u, checksum);
 #else
-  ASSERT_EQ(3486643515u, checksum);
+  EXPECT_EQ(3486643515u, checksum);
 #endif
 }

@ -2730,9 +2723,9 @@ TEST_F(LibYUVConvertTest, TestUYVYToARGB) {
  UYVYToARGB(&orig_pixels[0][0], 0, &dest_argb[0][0], 0, 256, 1);
  uint32_t checksum = HashDjb2(&dest_argb[0][0], sizeof(dest_argb), 5381);
 #if defined(LIBYUV_UNLIMITED_DATA)
-  ASSERT_EQ(10343289u, checksum);
+  EXPECT_EQ(10343289u, checksum);
 #else
-  ASSERT_EQ(3486643515u, checksum);
+  EXPECT_EQ(3486643515u, checksum);
 #endif
 }

@ -2810,9 +2803,9 @@ TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
  printf("\n");

  uint32_t checksum_u = HashDjb2(&dest_u[0], sizeof(dest_u), 5381);
-  ASSERT_EQ(192508756u, checksum_u);
+  EXPECT_EQ(192508756u, checksum_u);
  uint32_t checksum_v = HashDjb2(&dest_v[0], sizeof(dest_v), 5381);
-  ASSERT_EQ(2590663990u, checksum_v);
+  EXPECT_EQ(2590663990u, checksum_v);
 }
 #endif

@ -2838,23 +2831,16 @@ TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
        memset(dest_v_c, 0, sizeof(dest_v_c));
        memset(dest_u_opt, 0, sizeof(dest_u_opt));
        memset(dest_v_opt, 0, sizeof(dest_v_opt));
-
+        
        int src_stride = (height == 1) ? 0 : kMaxWidth * 4;

-        ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0],
-                            &dest_v_c[0], width, &kArgbI601Constants);
-        ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride,
-                                   &dest_u_opt[0], &dest_v_opt[0], width,
-                                   &kArgbI601Constants);
+        ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0], &dest_v_c[0], width, &kArgbI601Constants);
+        ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride, &dest_u_opt[0], &dest_v_opt[0], width, &kArgbI601Constants);

        int half_width = (width + 1) / 2;
        for (int i = 0; i < half_width; ++i) {
-          ASSERT_EQ(dest_u_c[i], dest_u_opt[i])
-              << "u mismatch at " << i << " width " << width << " height "
-              << height;
-          ASSERT_EQ(dest_v_c[i], dest_v_opt[i])
-              << "v mismatch at " << i << " width " << width << " height "
-              << height;
+          EXPECT_EQ(dest_u_c[i], dest_u_opt[i]) << "u mismatch at " << i << " width " << width << " height " << height;
+          EXPECT_EQ(dest_v_c[i], dest_v_opt[i]) << "v mismatch at " << i << " width " << width << " height " << height;
        }
      }
    }
@ -2867,7 +2853,6 @@ TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
    (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))
 // TODO(fbarchard): Consider _set_new_mode(0) to make malloc return NULL

-#ifndef DISABLE_SLOW_TESTS
 TEST_F(LibYUVConvertTest, TestI400LargeSize) {
  // The width and height are chosen as follows:
  // - kWidth * kHeight is not a multiple of 8: This lets us to use the Any
@ -2911,18 +2896,18 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
  for (int i = 0; i < kWidth * kHeight; ++i) {
    orig_i400[i] = i % 256;
  }
-  ASSERT_EQ(I400ToARGBMatrix(orig_i400, kStride, dest_argb, kWidth,
+  EXPECT_EQ(I400ToARGBMatrix(orig_i400, kStride, dest_argb, kWidth,
                             &kYuvJPEGConstants, kWidth, kHeight),
            0);
  free_aligned_buffer_page_end(dest_argb);
  free_aligned_buffer_page_end(orig_i400);
 }
-#endif  // DISABLE_SLOW_TESTS
 #endif  // !defined(DISABLE_SLOW_TESTS) && \
        // (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))

 #endif  // !defined(LEAN_TESTS)

+
 #define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
                   SUBSAMP_Y, W1280, N, NEG, OFF)                              \
  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) {                             \
@ -2935,17 +2920,17 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
    align_buffer_page_end(src_argb,                                            \
-                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_y_c, kStrideY * kHeight);                        \
+                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+    align_buffer_page_end(dst_y_c, kStrideY* kHeight);                         \
    align_buffer_page_end(dst_uv_c, kSizeUV);                                  \
-    align_buffer_page_end(dst_y_opt, kStrideY * kHeight);                      \
+    align_buffer_page_end(dst_y_opt, kStrideY* kHeight);                       \
    align_buffer_page_end(dst_uv_opt, kSizeUV);                                \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
      src_argb[i + OFF] = (fastrand() & 0xff);                                 \
    }                                                                          \
-    memset(dst_y_c, 1, kStrideY * kHeight);                                    \
+    memset(dst_y_c, 1, kStrideY* kHeight);                                     \
    memset(dst_uv_c, 2, kSizeUV);                                              \
-    memset(dst_y_opt, 101, kStrideY * kHeight);                                \
+    memset(dst_y_opt, 101, kStrideY* kHeight);                                 \
    memset(dst_uv_opt, 102, kSizeUV);                                          \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY,   \
@ -2956,10 +2941,10 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
                       kStrideY, dst_uv_opt, kStrideUV, kWidth, NEG kHeight);  \
    }                                                                          \
    for (int i = 0; i < kStrideY * kHeight; ++i) {                             \
-      ASSERT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]);                                     \
    }                                                                          \
    for (int i = 0; i < kSizeUV; ++i) {                                        \
-      ASSERT_EQ(dst_uv_c[i], dst_uv_opt[i]);                                   \
+      EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]);                                   \
    }                                                                          \
    free_aligned_buffer_page_end(src_argb);                                    \
    free_aligned_buffer_page_end(dst_y_c);                                     \
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
--- a/unit_test/cpu_test.cc
+++ b/unit_test/cpu_test.cc
@ -48,7 +48,7 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
    printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n",
           reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1],
           cpu_info[2]);
-    ASSERT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+    EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));

    // CPU Family and Model
    // 3:0 - Stepping
@ -189,6 +189,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
    int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
    int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
    int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
+    int has_avx512bmm = TestCpuFlag(kCpuHasAVX512BMM);
    printf("Has X86 0x%x\n", has_x86);
    printf("Has SSE2 0x%x\n", has_sse2);
    printf("Has SSSE3 0x%x\n", has_ssse3);
@ -211,6 +212,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
    printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
    printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
    printf("Has AMXINT8 0x%x\n", has_amxint8);
+    printf("Has AVX512BMM 0x%x\n", has_avx512bmm);
  }
 #endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
        // defined(_M_X64)
@ -327,8 +329,8 @@ TEST_F(LibYUVBaseTest, DISABLED_TestLinuxArm) {
  if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
    printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n");

-    ASSERT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
-    ASSERT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
+    EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
+    EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
  } else {
    printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n");
  }
@ -347,23 +349,23 @@ TEST_F(LibYUVBaseTest, DISABLED_TestLinuxArm) {
 #if defined(__linux__) && defined(__aarch64__)
 TEST_F(LibYUVBaseTest, TestLinuxAArch64) {
  // Values taken from a Cortex-A57 machine, only Neon available.
-  ASSERT_EQ(kCpuHasNEON, AArch64CpuCaps(0xffU, 0x0U));
+  EXPECT_EQ(kCpuHasNEON, AArch64CpuCaps(0xffU, 0x0U));

  // Values taken from a Google Pixel 7.
  int expected = kCpuHasNEON | kCpuHasNeonDotProd;
-  ASSERT_EQ(expected, AArch64CpuCaps(0x119fffU, 0x0U));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x119fffU, 0x0U));

  // Values taken from a Google Pixel 8.
  expected = kCpuHasNEON | kCpuHasNeonDotProd | kCpuHasNeonI8MM | kCpuHasSVE |
             kCpuHasSVE2;
-  ASSERT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f33fU));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f33fU));

  // Values taken from a Neoverse N2 machine.
-  ASSERT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f3ffU));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x2f3ffU));

  // Check for SME feature detection.
  expected |= kCpuHasSME;
-  ASSERT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x82f3ffU));
+  EXPECT_EQ(expected, AArch64CpuCaps(0x3fffffffU, 0x82f3ffU));

  // TODO: Check for SME2 feature detection from Apple M4
 }
@ -373,10 +375,10 @@ TEST_F(LibYUVBaseTest, DISABLED_TestLinuxRVV) {
  if (FileExists("../../unit_test/testdata/riscv64.txt")) {
    printf("Note: testing to load \"../../unit_test/testdata/riscv64.txt\"\n");

-    ASSERT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt"));
-    ASSERT_EQ(kCpuHasRVV,
+    EXPECT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt"));
+    EXPECT_EQ(kCpuHasRVV,
              RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv.txt"));
-    ASSERT_EQ(kCpuHasRVV | kCpuHasRVVZVFH,
+    EXPECT_EQ(kCpuHasRVV | kCpuHasRVVZVFH,
              RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv_zvfh.txt"));
  } else {
    printf(
@ -410,15 +412,15 @@ TEST_F(LibYUVBaseTest, MAYBE_TestSetCpuFlags) {
  // Test setting different CPU configurations.
  int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized;
  SetCpuFlags(cpu_flags);
-  ASSERT_EQ(cpu_flags, TestCpuFlag(-1));
+  EXPECT_EQ(cpu_flags, TestCpuFlag(-1));

  cpu_flags = kCpuHasX86 | kCpuInitialized;
  SetCpuFlags(cpu_flags);
-  ASSERT_EQ(cpu_flags, TestCpuFlag(-1));
+  EXPECT_EQ(cpu_flags, TestCpuFlag(-1));

  // Test that setting 0 turns auto-init back on.
  SetCpuFlags(0);
-  ASSERT_EQ(original_cpu_flags, TestCpuFlag(-1));
+  EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1));

  // Restore the CPU flag mask.
  MaskCpuFlags(benchmark_cpu_info_);
--- a/unit_test/cpu_thread_test.cc
+++ b/unit_test/cpu_thread_test.cc
@ -51,10 +51,10 @@ TEST(LibYUVCpuThreadTest, TestCpuFlagMultipleThreads) {
  ret = pthread_create(&thread2, nullptr, ThreadMain, &cpu_flags2);
  ASSERT_EQ(ret, 0);
  ret = pthread_join(thread1, nullptr);
-  ASSERT_EQ(ret, 0);
+  EXPECT_EQ(ret, 0);
  ret = pthread_join(thread2, nullptr);
-  ASSERT_EQ(ret, 0);
-  ASSERT_EQ(cpu_flags1, cpu_flags2);
+  EXPECT_EQ(ret, 0);
+  EXPECT_EQ(cpu_flags1, cpu_flags2);
 #else
  printf("pthread unavailable; Test skipped.");
 #endif  // LIBYUV_HAVE_PTHREAD
--- a/unit_test/math_test.cc
+++ b/unit_test/math_test.cc
@ -30,44 +30,44 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
  int result_opt[1280];
  int result_c[1280];

-  ASSERT_EQ(0x10000, libyuv::FixedDiv(1, 1));
-  ASSERT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1));
+  EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
  // TODO(fbarchard): Avoid the following that throw exceptions.
-  // ASSERT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
-  // ASSERT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));
+  // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
+  // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));

-  ASSERT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
-  ASSERT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
-  ASSERT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
-  ASSERT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
-  ASSERT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
-  ASSERT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
-  ASSERT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
-  ASSERT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
-  ASSERT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
-  ASSERT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
-  ASSERT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
-  ASSERT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));
-  ASSERT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
-  ASSERT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
-  ASSERT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
-  ASSERT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
-  ASSERT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
-  ASSERT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
+  EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
+  EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
+  EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
+  EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
+  EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
+  EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
+  EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
+  EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
+  EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
+  EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
+  EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
+  EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
+  EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));

  for (int i = 1; i < 4100; ++i) {
-    ASSERT_EQ(0x10000, libyuv::FixedDiv(i, i));
-    ASSERT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
-    ASSERT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
-    ASSERT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
-    ASSERT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
-    ASSERT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
+    EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i));
+    EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
+    EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
+    EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
+    EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
+    EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
  }
-  ASSERT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+  EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));

  MemRandomize(reinterpret_cast<uint8_t*>(&num[0]), sizeof(num));
  MemRandomize(reinterpret_cast<uint8_t*>(&div[0]), sizeof(div));
@ -84,7 +84,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv) {
  }
  for (int j = 0; j < 1280; ++j) {
    result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
-    ASSERT_NEAR(result_c[j], result_opt[j], 1);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
  }
 }

@ -118,7 +118,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) {
  }
  for (int j = 0; j < 1280; ++j) {
    result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
-    ASSERT_NEAR(result_c[j], result_opt[j], 1);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
  }
 }

@ -152,7 +152,7 @@ TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) {
  }
  for (int j = 0; j < 1280; ++j) {
    result_c[j] = libyuv::FixedDiv1_C(num[j], div[j]);
-    ASSERT_NEAR(result_c[j], result_opt[j], 1);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
  }
 }
 #endif  // ENABLE_ROW_TESTS
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
--- a/unit_test/rotate_argb_test.cc
+++ b/unit_test/rotate_argb_test.cc
@ -75,7 +75,7 @@ static void TestRotateBpp(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_argb_plane_size; ++i) {
-    ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
  }

  free_aligned_buffer_page_end(dst_argb_c);
@ -189,35 +189,35 @@ TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
  align_buffer_page_end(src_argb, argb_plane_size);
  align_buffer_page_end(dst_argb, argb_plane_size);

-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                          benchmark_width_ * 4, benchmark_width_,
                          benchmark_height_, kRotate0));

-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                          benchmark_width_ * 4 - 1, benchmark_width_ - 1,
                          benchmark_height_, kRotate0));

-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                          benchmark_width_ * 4, benchmark_width_,
                          benchmark_height_, kRotate180));

-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                          benchmark_width_ * 4 - 1, benchmark_width_ - 1,
                          benchmark_height_, kRotate180));

-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                          abs(benchmark_height_) * 4, benchmark_width_,
                          benchmark_height_, kRotate90));

-  ASSERT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                           abs(benchmark_height_) * 4, benchmark_width_ - 1,
                           benchmark_height_, kRotate90));

-  ASSERT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
+  EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
                          abs(benchmark_height_) * 4, benchmark_width_,
                          benchmark_height_, kRotate270));

-  ASSERT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
+  EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
                           abs(benchmark_height_) * 4, benchmark_width_ - 1,
                           benchmark_height_, kRotate270));

@ -271,7 +271,7 @@ static void TestRotatePlane_16(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_plane_size; ++i) {
-    ASSERT_EQ(dst_c[i], dst_opt[i]);
+    EXPECT_EQ(dst_c[i], dst_opt[i]);
  }

  free_aligned_buffer_page_end_16(dst_c);
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
@ -20,7 +20,7 @@

 namespace libyuv {

-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))

 static void I420TestRotate(int src_width,
                           int src_height,
@ -78,7 +78,7 @@ static void I420TestRotate(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_i420_size; ++i) {
-    ASSERT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
  }

  free_aligned_buffer_page_end(dst_i420_c);
@ -197,7 +197,7 @@ static void I422TestRotate(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_i422_size; ++i) {
-    ASSERT_EQ(dst_i422_c[i], dst_i422_opt[i]);
+    EXPECT_EQ(dst_i422_c[i], dst_i422_opt[i]);
  }

  free_aligned_buffer_page_end(dst_i422_c);
@ -283,7 +283,7 @@ static void I444TestRotate(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_i444_size; ++i) {
-    ASSERT_EQ(dst_i444_c[i], dst_i444_opt[i]);
+    EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
  }

  free_aligned_buffer_page_end(dst_i444_c);
@ -401,7 +401,7 @@ static void NV12TestRotate(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_i420_size; ++i) {
-    ASSERT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+    EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
  }

  free_aligned_buffer_page_end(dst_i420_c);
@ -495,15 +495,15 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
    const int kHeight = benchmark_height_;                                    \
    const int kSizeUV =                                                       \
        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
    align_buffer_page_end(src_uv,                                             \
-                          kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);     \
-    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
+                          kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);       \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
    align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                       SUBSAMPLE(kHeight, SUBSAMP_Y));        \
    align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                       SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
    align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
                                         SUBSAMPLE(kHeight, SUBSAMP_Y));      \
    align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
@ -522,12 +522,12 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
            (fastrand() & 0xff);                                              \
      }                                                                       \
    }                                                                         \
-    memset(dst_y_c, 1, kWidth * kHeight);                                     \
+    memset(dst_y_c, 1, kWidth* kHeight);                                      \
    memset(dst_u_c, 2,                                                        \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
    memset(dst_v_c, 3,                                                        \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
+    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
    memset(dst_u_opt, 102,                                                    \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
    memset(dst_v_opt, 103,                                                    \
@ -550,18 +550,18 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
    }                                                                         \
    for (int i = 0; i < kHeight; ++i) {                                       \
      for (int j = 0; j < kWidth; ++j) {                                      \
-        ASSERT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
+        EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]);        \
      }                                                                       \
    }                                                                         \
    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
      for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                \
-        ASSERT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
+        EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
                  dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]);           \
      }                                                                       \
    }                                                                         \
    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
      for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) {                \
-        ASSERT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
+        EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j],              \
                  dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]);           \
      }                                                                       \
    }                                                                         \
@ -656,7 +656,7 @@ static void I010TestRotate(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_i010_size; ++i) {
-    ASSERT_EQ(dst_i010_c[i], dst_i010_opt[i]);
+    EXPECT_EQ(dst_i010_c[i], dst_i010_opt[i]);
  }

  free_aligned_buffer_page_end_16(dst_i010_c);
@ -744,7 +744,7 @@ static void I210TestRotate(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_i210_size; ++i) {
-    ASSERT_EQ(dst_i210_c[i], dst_i210_opt[i]);
+    EXPECT_EQ(dst_i210_c[i], dst_i210_opt[i]);
  }

  free_aligned_buffer_page_end_16(dst_i210_c);
@ -830,7 +830,7 @@ static void I410TestRotate(int src_width,

  // Rotation should be exact.
  for (int i = 0; i < dst_i410_size; ++i) {
-    ASSERT_EQ(dst_i410_c[i], dst_i410_opt[i]);
+    EXPECT_EQ(dst_i410_c[i], dst_i410_opt[i]);
  }

  free_aligned_buffer_page_end_16(dst_i410_c);
@ -906,8 +906,8 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) {

  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
-      ASSERT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
-      ASSERT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
+      EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
+      EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
    }
  }
 }
@ -949,7 +949,7 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
  }

  for (int i = 0; i < width * height; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }

  free_aligned_buffer_page_end(src_pixels);
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@ -245,14 +245,14 @@ static int ARGBClipTestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,          \
        benchmark_cpu_info_);                                                \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
  }                                                                          \
  TEST_F(LibYUVScaleTest, DISABLED_##ARGBScaleDownClipBy##name##_##filter) { \
    int diff = ARGBClipTestFilter(                                           \
        SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
        kFilter##filter, benchmark_iterations_);                             \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
  }

 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
@ -294,28 +294,28 @@ TEST_FACTOR(3, 1, 3)
    int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width,      \
                              height, kFilter##filter, benchmark_iterations_,  \
                              disable_cpu_flags_, benchmark_cpu_info_);        \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
  }                                                                            \
  TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {           \
    int diff = ARGBTestFilter(width, height, Abs(benchmark_width_),            \
                              Abs(benchmark_height_), kFilter##filter,         \
                              benchmark_iterations_, disable_cpu_flags_,       \
                              benchmark_cpu_info_);                            \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
  }                                                                            \
  TEST_F(LibYUVScaleTest,                                                      \
         DISABLED_##name##ClipTo##width##x##height##_##filter) {               \
    int diff =                                                                 \
        ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
                           kFilter##filter, benchmark_iterations_);            \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
  }                                                                            \
  TEST_F(LibYUVScaleTest,                                                      \
         DISABLED_##name##ClipFrom##width##x##height##_##filter) {             \
    int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_),        \
                                  Abs(benchmark_height_), kFilter##filter,     \
                                  benchmark_iterations_);                      \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
  }

 #ifndef DISABLE_SLOW_TESTS
@ -357,7 +357,7 @@ TEST_SCALETO(ARGBScale, 1920, 1080)
                              benchmark_height_, benchmark_width_,      \
                              kFilter##filter, benchmark_iterations_,   \
                              disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                          \
+    EXPECT_LE(diff, max_diff);                                          \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -430,14 +430,12 @@ static void FillRamp(uint8_t* buf,
 }

 // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static void YUVToARGBTestFilter(int src_width,
-                                int src_height,
-                                int dst_width,
-                                int dst_height,
-                                FilterMode f,
-                                int benchmark_iterations,
-                                int error_threshold,
-                                int* max_diff_out) {
+static int YUVToARGBTestFilter(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               FilterMode f,
+                               int benchmark_iterations) {
  int64_t src_y_plane_size = Abs(src_width) * Abs(src_height);
  int64_t src_uv_plane_size =
      ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
@ -448,13 +446,13 @@ static void YUVToARGBTestFilter(int src_width,
  align_buffer_page_end(src_u, src_uv_plane_size);
  align_buffer_page_end(src_v, src_uv_plane_size);

-  int64_t dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
-  int dst_stride_argb = (dst_width) * 4;
+  int64_t dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
+  int dst_stride_argb = (dst_width)*4;
  align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
  align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
  if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
    printf("Skipped.  Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
-    return;
+    return 0;
  }
  // Fill YUV image with continuous ramp, which is less sensitive to
  // subsampling and filtering differences for test purposes.
@ -483,44 +481,36 @@ static void YUVToARGBTestFilter(int src_width,
      int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
                         dst_argb_opt[(i * dst_stride_argb) + j]);
      if (abs_diff > max_diff) {
-        max_diff = abs_diff;
-      }
-      if (abs_diff > error_threshold) {
-        printf("error %d at %d,%d c %d opt %d\n", abs_diff, j, i,
+        printf("error %d at %d,%d c %d opt %d", abs_diff, j, i,
               dst_argb_c[(i * dst_stride_argb) + j],
               dst_argb_opt[(i * dst_stride_argb) + j]);
-        goto cleanup;
+        EXPECT_LE(abs_diff, 40);
+        max_diff = abs_diff;
      }
    }
  }

-cleanup:
-  if (max_diff_out) {
-    *max_diff_out = max_diff;
-  }
  free_aligned_buffer_page_end(dst_argb_c);
  free_aligned_buffer_page_end(dst_argb_opt);
  free_aligned_buffer_page_end(src_y);
  free_aligned_buffer_page_end(src_u);
  free_aligned_buffer_page_end(src_v);
+  return max_diff;
 }

 TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
-  int diff = 0;
-  YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
-                      benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
-                      libyuv::kFilterBilinear, benchmark_iterations_, 10,
-                      &diff);
-  ASSERT_LE(diff, 10);
+  int diff =
+      YUVToARGBTestFilter(benchmark_width_, benchmark_height_,
+                          benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
+                          libyuv::kFilterBilinear, benchmark_iterations_);
+  EXPECT_LE(diff, 10);
 }

 TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
-  int diff = 0;
-  YUVToARGBTestFilter(benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
-                      benchmark_width_, benchmark_height_,
-                      libyuv::kFilterBilinear, benchmark_iterations_, 10,
-                      &diff);
-  ASSERT_LE(diff, 10);
+  int diff = YUVToARGBTestFilter(
+      benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_,
+      benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_);
+  EXPECT_LE(diff, 10);
 }

 TEST_F(LibYUVScaleTest, ARGBTest3x) {
@ -543,18 +533,18 @@ TEST_F(LibYUVScaleTest, ARGBTest3x) {
              kFilterBilinear);
  }

-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
-  ASSERT_EQ(226, dest_pixels[2]);
-  ASSERT_EQ(235, dest_pixels[3]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(226, dest_pixels[2]);
+  EXPECT_EQ(235, dest_pixels[3]);

  ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
            kFilterNone);

-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
-  ASSERT_EQ(226, dest_pixels[2]);
-  ASSERT_EQ(235, dest_pixels[3]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(226, dest_pixels[2]);
+  EXPECT_EQ(235, dest_pixels[3]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
@ -580,18 +570,18 @@ TEST_F(LibYUVScaleTest, ARGBTest4x) {
              kFilterBilinear);
  }

-  ASSERT_NEAR(66, dest_pixels[0], 4);
-  ASSERT_NEAR(255 - 66, dest_pixels[1], 4);
-  ASSERT_NEAR(67, dest_pixels[2], 4);
-  ASSERT_NEAR(76, dest_pixels[3], 4);
+  EXPECT_NEAR(66, dest_pixels[0], 4);
+  EXPECT_NEAR(255 - 66, dest_pixels[1], 4);
+  EXPECT_NEAR(67, dest_pixels[2], 4);
+  EXPECT_NEAR(76, dest_pixels[3], 4);

  ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
            kFilterNone);

-  ASSERT_EQ(2, dest_pixels[0]);
-  ASSERT_EQ(255 - 2, dest_pixels[1]);
-  ASSERT_EQ(3, dest_pixels[2]);
-  ASSERT_EQ(12, dest_pixels[3]);
+  EXPECT_EQ(2, dest_pixels[0]);
+  EXPECT_EQ(255 - 2, dest_pixels[1]);
+  EXPECT_EQ(3, dest_pixels[2]);
+  EXPECT_EQ(12, dest_pixels[3]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
--- a/unit_test/scale_plane_test.cc
+++ b/unit_test/scale_plane_test.cc
@ -42,108 +42,6 @@

 namespace libyuv {

-// POC: int row_stride = src_stride * 2 overflows to a small negative value
-// when src_stride is close to INT_MAX, causing src_ptr to walk backward
-// past the start of the source allocation on the second loop iteration.
-// With src_stride = 0x7FFFFFFE, row_stride = (int)0xFFFFFFFC = -4, so on
-// y=1 ScaleRowDown2Box reads 4 bytes before the heap allocation.
-TEST_F(LibYUVScaleTest, ScalePlaneDown2_RowStrideOverflow) {
-  constexpr int kSrcStride = 0x7FFFFFFE;  // INT_MAX - 1
-  constexpr int kSrcW = 64;
-  constexpr int kSrcH = 4;
-  constexpr int kDstW = 32;
-  constexpr int kDstH = 2;
-  // src_size = (kSrcH - 1) * stride + width.
-  size_t src_size = kSrcH - 1;
-  if (src_size > SIZE_MAX / kSrcStride) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size *= kSrcStride;
-  if (src_size > SIZE_MAX - kSrcW) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size += kSrcW;
-
-#if defined(__aarch64__)
-  // Infer malloc can accept a large size for cpu with dot product (a76/a55)
-  int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd);
-#else
-  int has_large_malloc = 1;
-#endif
-  if (!has_large_malloc) {
-    GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes";
-  }
-
-  uint8_t* src = new (std::nothrow) uint8_t[src_size];
-  if (!src) {
-    GTEST_SKIP() << "could not allocate " << src_size << " bytes";
-  }
-  uint8_t dst[kDstW * kDstH];
-  uint8_t* src_row = src;
-  for (int i = 0; i < kSrcH; i++) {
-    memset(src_row, 0x41, kSrcW);
-    src_row += kSrcStride;
-  }
-  // Force the C row kernel: the SIMD kernels are inline asm that ASAN does not
-  // instrument, so they silently read OOB without a report.
-  MaskCpuFlags(1);
-  // 2*dst == src on both axes -> ScalePlane dispatches to ScalePlaneDown2.
-  // int row_stride = kSrcStride * 2 wraps to -4; on y=1 src_ptr underflows.
-  ScalePlane(src, kSrcStride, kSrcW, kSrcH, dst, kDstW, kDstW, kDstH,
-             kFilterBox);
-  MaskCpuFlags(0);
-  delete[] src;
-}
-
-// POC: same defect in the 1/4 fast path. src_stride = 0x3FFFFFFF gives
-// int row_stride = src_stride * 4 = (int)0xFFFFFFFC = -4.
-TEST_F(LibYUVScaleTest, ScalePlaneDown4_RowStrideOverflow) {
-  constexpr int kSrcStride = 0x3FFFFFFF;  // INT_MAX / 4 (rounded down)
-  constexpr int kSrcW = 64;
-  constexpr int kSrcH = 8;
-  constexpr int kDstW = 16;
-  constexpr int kDstH = 2;
-  // src_size = (kSrcH - 1) * stride + width.
-  size_t src_size = kSrcH - 1;
-  if (src_size > SIZE_MAX / kSrcStride) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size *= kSrcStride;
-  if (src_size > SIZE_MAX - kSrcW) {
-    GTEST_SKIP() << "could not represent allocation size in size_t";
-  }
-  src_size += kSrcW;
-
-#if defined(__aarch64__)
-  // Infer malloc can accept a large size for cpu with dot product (a76/a55)
-  int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd);
-#else
-  int has_large_malloc = 1;
-#endif
-  if (!has_large_malloc) {
-    GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes";
-  }
-
-  uint8_t* src = new (std::nothrow) uint8_t[src_size];
-  if (!src) {
-    GTEST_SKIP() << "could not allocate " << src_size << " bytes";
-  }
-  uint8_t dst[kDstW * kDstH];
-  uint8_t* src_row = src;
-  for (int i = 0; i < kSrcH; i++) {
-    memset(src_row, 0x41, kSrcW);
-    src_row += kSrcStride;
-  }
-  // Force the C row kernel: the SIMD kernels are inline asm that ASAN does not
-  // instrument, so they silently read OOB without a report.
-  MaskCpuFlags(1);
-  // 4*dst == src on both axes with kFilterBox -> ScalePlaneDown4.
-  ScalePlane(src, kSrcStride, kSrcW, kSrcH, dst, kDstW, kDstW, kDstH,
-             kFilterBox);
-  MaskCpuFlags(0);
-  delete[] src;
-}
-
 #ifdef ENABLE_ROW_TESTS
 #ifdef HAS_SCALEROWDOWN2_SSSE3
 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
@ -187,49 +85,49 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
    // Test regular half size.
    ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);

-    ASSERT_EQ(64u, dst_pixels_c[0]);
-    ASSERT_EQ(25u, dst_pixels_c[1]);
-    ASSERT_EQ(13u, dst_pixels_c[2]);
-    ASSERT_EQ(5u, dst_pixels_c[3]);
-    ASSERT_EQ(0u, dst_pixels_c[4]);
-    ASSERT_EQ(133u, dst_pixels_c[63]);
+    EXPECT_EQ(64u, dst_pixels_c[0]);
+    EXPECT_EQ(25u, dst_pixels_c[1]);
+    EXPECT_EQ(13u, dst_pixels_c[2]);
+    EXPECT_EQ(5u, dst_pixels_c[3]);
+    EXPECT_EQ(0u, dst_pixels_c[4]);
+    EXPECT_EQ(133u, dst_pixels_c[63]);

    // Test Odd width version - Last pixel is just 1 horizontal pixel.
    ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);

-    ASSERT_EQ(64u, dst_pixels_c[0]);
-    ASSERT_EQ(25u, dst_pixels_c[1]);
-    ASSERT_EQ(13u, dst_pixels_c[2]);
-    ASSERT_EQ(5u, dst_pixels_c[3]);
-    ASSERT_EQ(0u, dst_pixels_c[4]);
-    ASSERT_EQ(10u, dst_pixels_c[63]);
+    EXPECT_EQ(64u, dst_pixels_c[0]);
+    EXPECT_EQ(25u, dst_pixels_c[1]);
+    EXPECT_EQ(13u, dst_pixels_c[2]);
+    EXPECT_EQ(5u, dst_pixels_c[3]);
+    EXPECT_EQ(0u, dst_pixels_c[4]);
+    EXPECT_EQ(10u, dst_pixels_c[63]);

    // Test one pixel less, should skip the last pixel.
    memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
    ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);

-    ASSERT_EQ(64u, dst_pixels_c[0]);
-    ASSERT_EQ(25u, dst_pixels_c[1]);
-    ASSERT_EQ(13u, dst_pixels_c[2]);
-    ASSERT_EQ(5u, dst_pixels_c[3]);
-    ASSERT_EQ(0u, dst_pixels_c[4]);
-    ASSERT_EQ(0u, dst_pixels_c[63]);
+    EXPECT_EQ(64u, dst_pixels_c[0]);
+    EXPECT_EQ(25u, dst_pixels_c[1]);
+    EXPECT_EQ(13u, dst_pixels_c[2]);
+    EXPECT_EQ(5u, dst_pixels_c[3]);
+    EXPECT_EQ(0u, dst_pixels_c[4]);
+    EXPECT_EQ(0u, dst_pixels_c[63]);

    // Test regular half size SSSE3.
    ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);

-    ASSERT_EQ(64u, dst_pixels_opt[0]);
-    ASSERT_EQ(25u, dst_pixels_opt[1]);
-    ASSERT_EQ(13u, dst_pixels_opt[2]);
-    ASSERT_EQ(5u, dst_pixels_opt[3]);
-    ASSERT_EQ(0u, dst_pixels_opt[4]);
-    ASSERT_EQ(133u, dst_pixels_opt[63]);
+    EXPECT_EQ(64u, dst_pixels_opt[0]);
+    EXPECT_EQ(25u, dst_pixels_opt[1]);
+    EXPECT_EQ(13u, dst_pixels_opt[2]);
+    EXPECT_EQ(5u, dst_pixels_opt[3]);
+    EXPECT_EQ(0u, dst_pixels_opt[4]);
+    EXPECT_EQ(133u, dst_pixels_opt[63]);

    // Compare C and SSSE3 match.
    ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
    ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
    for (int i = 0; i < 64; ++i) {
-      ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
    }
  }
 }
@ -262,11 +160,11 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
  }

  for (int i = 0; i < 1280; ++i) {
-    ASSERT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }

-  ASSERT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
-  ASSERT_EQ(dst_pixels_c[1279], 3839);
+  EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
+  EXPECT_EQ(dst_pixels_c[1279], 3839);
 }
 #endif  // ENABLE_ROW_TESTS

@ -346,7 +244,7 @@ static int TestPlaneFilter_16(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),   \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,            \
        benchmark_cpu_info_);                                                  \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
  }

 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
@ -385,12 +283,12 @@ TEST_F(LibYUVScaleTest, PlaneTest3x) {
               kFilterBilinear);
  }

-  ASSERT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(225, dest_pixels[0]);

  ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
             kFilterNone);

-  ASSERT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(225, dest_pixels[0]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
@ -413,12 +311,12 @@ TEST_F(LibYUVScaleTest, PlaneTest4x) {
               kFilterBilinear);
  }

-  ASSERT_EQ(66, dest_pixels[0]);
+  EXPECT_EQ(66, dest_pixels[0]);

  ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
             kFilterNone);

-  ASSERT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
+  EXPECT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
@ -447,7 +345,7 @@ TEST_F(LibYUVScaleTest, PlaneTestRotate_None) {
  }

  for (int i = 0; i < kSize; ++i) {
-    ASSERT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
  }

  free_aligned_buffer_page_end(dest_c_pixels);
@ -477,7 +375,7 @@ TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) {
  }

  for (int i = 0; i < kSize; ++i) {
-    ASSERT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
  }

  free_aligned_buffer_page_end(dest_c_pixels);
@ -508,7 +406,7 @@ TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) {
  }

  for (int i = 0; i < kSize; ++i) {
-    ASSERT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
+    EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]);
  }

  free_aligned_buffer_page_end(dest_c_pixels);
@ -534,9 +432,9 @@ TEST_F(LibYUVScaleTest, PlaneTest1_Box) {
                     /* dst_width= */ 1, /* dst_height= */ 2,
                     libyuv::kFilterBox);

-  ASSERT_EQ(dst_pixels[0], 1);
-  ASSERT_EQ(dst_pixels[1], 1);
-  ASSERT_EQ(dst_pixels[2], 3);
+  EXPECT_EQ(dst_pixels[0], 1);
+  EXPECT_EQ(dst_pixels[1], 1);
+  EXPECT_EQ(dst_pixels[2], 3);

  free_aligned_buffer_page_end(dst_pixels);
  free_aligned_buffer_page_end(orig_pixels);
@ -562,9 +460,9 @@ TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) {
      /* src_height= */ 1, dst_pixels, /* dst_stride= */ 1,
      /* dst_width= */ 1, /* dst_height= */ 2, libyuv::kFilterNone);

-  ASSERT_EQ(dst_pixels[0], 1);
-  ASSERT_EQ(dst_pixels[1], 1);
-  ASSERT_EQ(dst_pixels[2], 3);
+  EXPECT_EQ(dst_pixels[0], 1);
+  EXPECT_EQ(dst_pixels[1], 1);
+  EXPECT_EQ(dst_pixels[2], 3);

  free_aligned_buffer_page_end(dst_pixels_alloc);
  free_aligned_buffer_page_end(orig_pixels_alloc);
@ -631,58 +529,9 @@ TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) {
                     kDstHeight, kFilterNone);

  // Not reached under ASAN.
-  ASSERT_EQ(0, r);
+  EXPECT_EQ(0, r);
  delete[] src;
  delete[] dst;
 }

-TEST_F(LibYUVScaleTest, ScalePlane_InvalidInputs) {
-  uint8_t src[16] = {0};
-  uint8_t dst[16] = {0};
-
-  // NULL src/dst
-  EXPECT_EQ(-1, ScalePlane(nullptr, 4, 4, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, nullptr, 4, 4, 4, kFilterNone));
-
-  // Width/height <= 0 (except src_height which can be negative but not 0)
-  EXPECT_EQ(-1, ScalePlane(src, 4, 0, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, -1, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 0, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, 0, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, -1, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, 4, 0, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 4, dst, 4, 4, -1, kFilterNone));
-
-  // Width/height too large (> 32768)
-  EXPECT_EQ(-1, ScalePlane(src, 4, 32769, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, 32769, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane(src, 4, 4, -32769, dst, 4, 4, 4, kFilterNone));
-
-  // Valid edge cases
-  EXPECT_EQ(0, ScalePlane(src, 4, 1, 1, dst, 4, 1, 1, kFilterNone));
-  EXPECT_EQ(0, ScalePlane(src, 4, 1, -1, dst, 4, 1, 1, kFilterNone));
-}
-
-TEST_F(LibYUVScaleTest, ScalePlane_16_InvalidInputs) {
-  uint16_t src[16] = {0};
-  uint16_t dst[16] = {0};
-
-  EXPECT_EQ(-1, ScalePlane_16(nullptr, 4, 4, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 4, 4, nullptr, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 0, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 32769, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_16(src, 4, 4, -32769, dst, 4, 4, 4, kFilterNone));
-}
-
-TEST_F(LibYUVScaleTest, ScalePlane_12_InvalidInputs) {
-  uint16_t src[16] = {0};
-  uint16_t dst[16] = {0};
-
-  EXPECT_EQ(-1, ScalePlane_12(nullptr, 4, 4, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 4, 4, nullptr, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 0, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 32769, 4, dst, 4, 4, 4, kFilterNone));
-  EXPECT_EQ(-1, ScalePlane_12(src, 4, 4, -32769, dst, 4, 4, 4, kFilterNone));
-}
-
 }  // namespace libyuv
--- a/unit_test/scale_rgb_test.cc
+++ b/unit_test/scale_rgb_test.cc
@ -128,7 +128,7 @@ static int RGBTestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,          \
        benchmark_cpu_info_);                                                \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -163,14 +163,14 @@ TEST_FACTOR(3, 1, 3)
    int diff = RGBTestFilter(benchmark_width_, benchmark_height_, width,     \
                             height, kFilter##filter, benchmark_iterations_, \
                             disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
  }                                                                          \
  TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {         \
    int diff = RGBTestFilter(width, height, Abs(benchmark_width_),           \
                             Abs(benchmark_height_), kFilter##filter,        \
                             benchmark_iterations_, disable_cpu_flags_,      \
                             benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                               \
+    EXPECT_LE(diff, max_diff);                                               \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -202,7 +202,7 @@ TEST_SCALETO(RGBScale, 1920, 1080)
                             benchmark_height_, benchmark_width_,      \
                             kFilter##filter, benchmark_iterations_,   \
                             disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                         \
+    EXPECT_LE(diff, max_diff);                                         \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -233,14 +233,14 @@ TEST_F(LibYUVScaleTest, RGBTest3x) {
             kFilterBilinear);
  }

-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);

  RGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
           kFilterNone);

-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
@ -264,14 +264,14 @@ TEST_F(LibYUVScaleTest, RGBTest4x) {
             kFilterBilinear);
  }

-  ASSERT_EQ(66, dest_pixels[0]);
-  ASSERT_EQ(190, dest_pixels[1]);
+  EXPECT_EQ(66, dest_pixels[0]);
+  EXPECT_EQ(190, dest_pixels[1]);

  RGBScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
           kFilterNone);

-  ASSERT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
-  ASSERT_EQ(255 - 2, dest_pixels[1]);
+  EXPECT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
+  EXPECT_EQ(255 - 2, dest_pixels[1]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@ -757,7 +757,7 @@ static int NV12TestFilter(int src_width,
  int src_height_uv = (Abs(src_height) + 1) >> 1;

  int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
-  int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
+  int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv)*2;

  int src_stride_y = Abs(src_width);
  int src_stride_uv = src_width_uv * 2;
@ -775,7 +775,7 @@ static int NV12TestFilter(int src_width,
  int dst_height_uv = (dst_height + 1) >> 1;

  int64_t dst_y_plane_size = (dst_width) * (dst_height);
-  int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
+  int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv)*2;

  int dst_stride_y = dst_width;
  int dst_stride_uv = dst_width_uv * 2;
@ -856,7 +856,7 @@ static int NV12TestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
        benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) {                 \
    int diff = I444TestFilter(                                                \
@ -864,7 +864,7 @@ static int NV12TestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
        benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_12) { \
    int diff = I420TestFilter_12(                                             \
@ -872,7 +872,7 @@ static int NV12TestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
        benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_12) { \
    int diff = I444TestFilter_12(                                             \
@ -880,7 +880,7 @@ static int NV12TestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
        benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) {                 \
    int diff = NV12TestFilter(                                                \
@ -888,7 +888,7 @@ static int NV12TestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom),  \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,           \
        benchmark_cpu_info_);                                                 \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }

 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
@ -931,61 +931,61 @@ TEST_FACTOR(3, 1, 3, 0)
    int diff = I420TestFilter(benchmark_width_, benchmark_height_, width,     \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter) {      \
    int diff = I444TestFilter(benchmark_width_, benchmark_height_, width,     \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I420##name##To##width##x##height##_##filter##_12) {       \
    int diff = I420TestFilter_12(                                             \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I444##name##To##width##x##height##_##filter##_12) {       \
    int diff = I444TestFilter_12(                                             \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I420##name##To##width##x##height##_##filter##_16) {       \
    int diff = I420TestFilter_16(                                             \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I444##name##To##width##x##height##_##filter##_16) {       \
    int diff = I444TestFilter_16(                                             \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) {      \
    int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width,     \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) {    \
    int diff = I420TestFilter(width, height, Abs(benchmark_width_),           \
                              Abs(benchmark_height_), kFilter##filter,        \
                              benchmark_iterations_, disable_cpu_flags_,      \
                              benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, I444##name##From##width##x##height##_##filter) {    \
    int diff = I444TestFilter(width, height, Abs(benchmark_width_),           \
                              Abs(benchmark_height_), kFilter##filter,        \
                              benchmark_iterations_, disable_cpu_flags_,      \
                              benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I420##name##From##width##x##height##_##filter##_12) {     \
@ -993,7 +993,7 @@ TEST_FACTOR(3, 1, 3, 0)
                                 Abs(benchmark_height_), kFilter##filter,     \
                                 benchmark_iterations_, disable_cpu_flags_,   \
                                 benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I444##name##From##width##x##height##_##filter##_12) {     \
@ -1001,7 +1001,7 @@ TEST_FACTOR(3, 1, 3, 0)
                                 Abs(benchmark_height_), kFilter##filter,     \
                                 benchmark_iterations_, disable_cpu_flags_,   \
                                 benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I420##name##From##width##x##height##_##filter##_16) {     \
@ -1009,7 +1009,7 @@ TEST_FACTOR(3, 1, 3, 0)
                                 Abs(benchmark_height_), kFilter##filter,     \
                                 benchmark_iterations_, disable_cpu_flags_,   \
                                 benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest,                                                     \
         DISABLED_##I444##name##From##width##x##height##_##filter##_16) {     \
@ -1017,14 +1017,14 @@ TEST_FACTOR(3, 1, 3, 0)
                                 Abs(benchmark_height_), kFilter##filter,     \
                                 benchmark_iterations_, disable_cpu_flags_,   \
                                 benchmark_cpu_info_);                        \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }                                                                           \
  TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) {    \
    int diff = NV12TestFilter(width, height, Abs(benchmark_width_),           \
                              Abs(benchmark_height_), kFilter##filter,        \
                              benchmark_iterations_, disable_cpu_flags_,      \
                              benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                                \
+    EXPECT_LE(diff, max_diff);                                                \
  }

 #ifndef DISABLE_SLOW_TESTS
@ -1068,49 +1068,49 @@ TEST_SCALETO(Scale, 1080, 1920)  // for rotated phones
                              benchmark_height_, benchmark_width_,         \
                              kFilter##filter, benchmark_iterations_,      \
                              disable_cpu_flags_, benchmark_cpu_info_);    \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
  }                                                                        \
  TEST_F(LibYUVScaleTest, I444##name##SwapXY_##filter) {                   \
    int diff = I444TestFilter(benchmark_width_, benchmark_height_,         \
                              benchmark_height_, benchmark_width_,         \
                              kFilter##filter, benchmark_iterations_,      \
                              disable_cpu_flags_, benchmark_cpu_info_);    \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
  }                                                                        \
  TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_12) {   \
    int diff = I420TestFilter_12(benchmark_width_, benchmark_height_,      \
                                 benchmark_height_, benchmark_width_,      \
                                 kFilter##filter, benchmark_iterations_,   \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
  }                                                                        \
  TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_12) {   \
    int diff = I444TestFilter_12(benchmark_width_, benchmark_height_,      \
                                 benchmark_height_, benchmark_width_,      \
                                 kFilter##filter, benchmark_iterations_,   \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
  }                                                                        \
  TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) {   \
    int diff = I420TestFilter_16(benchmark_width_, benchmark_height_,      \
                                 benchmark_height_, benchmark_width_,      \
                                 kFilter##filter, benchmark_iterations_,   \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
  }                                                                        \
  TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) {   \
    int diff = I444TestFilter_16(benchmark_width_, benchmark_height_,      \
                                 benchmark_height_, benchmark_width_,      \
                                 kFilter##filter, benchmark_iterations_,   \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
  }                                                                        \
  TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) {                   \
    int diff = NV12TestFilter(benchmark_width_, benchmark_height_,         \
                              benchmark_height_, benchmark_width_,         \
                              kFilter##filter, benchmark_iterations_,      \
                              disable_cpu_flags_, benchmark_cpu_info_);    \
-    ASSERT_LE(diff, max_diff);                                             \
+    EXPECT_LE(diff, max_diff);                                             \
  }

 // Test scale to a specified size with all 4 filters.
--- a/unit_test/scale_uv_test.cc
+++ b/unit_test/scale_uv_test.cc
@ -101,7 +101,7 @@ static int UVTestFilter(int src_width,
        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,          \
        benchmark_cpu_info_);                                                \
-    ASSERT_EQ(0, diff);                                                      \
+    EXPECT_EQ(0, diff);                                                      \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -132,14 +132,14 @@ TEST_FACTOR(3, 1, 3)
    int diff = UVTestFilter(benchmark_width_, benchmark_height_, width,     \
                            height, kFilter##filter, benchmark_iterations_, \
                            disable_cpu_flags_, benchmark_cpu_info_);       \
-    ASSERT_LE(diff, max_diff);                                              \
+    EXPECT_LE(diff, max_diff);                                              \
  }                                                                         \
  TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) {        \
    int diff = UVTestFilter(width, height, Abs(benchmark_width_),           \
                            Abs(benchmark_height_), kFilter##filter,        \
                            benchmark_iterations_, disable_cpu_flags_,      \
                            benchmark_cpu_info_);                           \
-    ASSERT_LE(diff, max_diff);                                              \
+    EXPECT_LE(diff, max_diff);                                              \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -171,7 +171,7 @@ TEST_SCALETO(UVScale, 1920, 1080)
        UVTestFilter(benchmark_width_, benchmark_height_, benchmark_height_,   \
                     benchmark_width_, kFilter##filter, benchmark_iterations_, \
                     disable_cpu_flags_, benchmark_cpu_info_);                 \
-    ASSERT_LE(diff, max_diff);                                                 \
+    EXPECT_LE(diff, max_diff);                                                 \
  }

 #if defined(ENABLE_FULL_TESTS)
@ -202,14 +202,14 @@ TEST_F(LibYUVScaleTest, UVTest3x) {
            kFilterBilinear);
  }

-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);

  UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
          kFilterNone);

-  ASSERT_EQ(225, dest_pixels[0]);
-  ASSERT_EQ(255 - 225, dest_pixels[1]);
+  EXPECT_EQ(225, dest_pixels[0]);
+  EXPECT_EQ(255 - 225, dest_pixels[1]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
@ -233,14 +233,14 @@ TEST_F(LibYUVScaleTest, UVTest4x) {
            kFilterBilinear);
  }

-  ASSERT_EQ(66, dest_pixels[0]);
-  ASSERT_EQ(190, dest_pixels[1]);
+  EXPECT_EQ(66, dest_pixels[0]);
+  EXPECT_EQ(190, dest_pixels[1]);

  UVScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
          kFilterNone);

-  ASSERT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
-  ASSERT_EQ(255 - 2, dest_pixels[1]);
+  EXPECT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row
+  EXPECT_EQ(255 - 2, dest_pixels[1]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
--- a/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@ -169,6 +169,9 @@ static int TestCpuEnv(int cpu_info) {
  if (TestEnv("LIBYUV_DISABLE_AMXINT8")) {
    cpu_info &= ~libyuv::kCpuHasAMXINT8;
  }
+  if (TestEnv("LIBYUV_DISABLE_AVX512BMM")) {
+    cpu_info &= ~libyuv::kCpuHasAVX512BMM;
+  }
 #endif
  if (TestEnv("LIBYUV_DISABLE_ASM")) {
    cpu_info = libyuv::kCpuInitialized;
--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@ -85,11 +85,10 @@ static inline bool SizeValid(int src_width,
 #define align_buffer_page_end_16(var, size)                                 \
  uint16_t* var = NULL;                                                     \
  uint8_t* var##_mem =                                                      \
-      reinterpret_cast<uint8_t*>(malloc(((size) * 2 + 4095 + 63) & ~4095)); \
+      reinterpret_cast<uint8_t*>(malloc(((size)*2 + 4095 + 63) & ~4095));   \
  if (var##_mem)                                                            \
  var = reinterpret_cast<uint16_t*>(                                        \
-      (intptr_t)(var##_mem + (((size) * 2 + 4095 + 63) & ~4095) -           \
-                 (size) * 2) &                                              \
+      (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \
      ~63)

 #define free_aligned_buffer_page_end_16(var) \
--- a/unit_test/video_common_test.cc
+++ b/unit_test/video_common_test.cc
@ -36,77 +36,77 @@ static bool TestValidFourCC(uint32_t fourcc, int bpp) {
 }

 TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
-  ASSERT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
+  EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
 }

 TEST_F(LibYUVBaseTest, TestFourCC) {
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));  // deprecated.
-  ASSERT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));  // deprecated.
-  ASSERT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
-  ASSERT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));  // deprecated.
+  EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));  // deprecated.
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
+  EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
 }

 }  // namespace libyuv
--- a/util/cpuid.c
+++ b/util/cpuid.c
@ -15,6 +15,8 @@
 #ifdef __linux__
 #include <ctype.h>
 #include <sys/utsname.h>
+#include <signal.h>
+#include <setjmp.h>
 #endif

 #include "libyuv/cpu_id.h"
@ -40,6 +42,14 @@ static void KernelVersion(int* version) {
 }
 #endif

+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__))
+static sigjmp_buf vdpphps_jmpbuf;  // Jump target armed by main()'s sigsetjmp.
+static void vdpphps_sigill_handler(int sig) {  // SIGILL handler for the probe.
+  (void)sig;
+  siglongjmp(vdpphps_jmpbuf, 1);  // Makes sigsetjmp() in main() return 1.
+}
+#endif  // __linux__ && x86: only the x86 VDPPHPS probe in main() uses these.
+
 int main(int argc, const char* argv[]) {
  (void)argc;
  (void)argv;
@ -182,6 +192,7 @@ int main(int argc, const char* argv[]) {
    int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
    int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
    int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
+    int has_avx512bmm = TestCpuFlag(kCpuHasAVX512BMM);
    printf("Has X86 0x%x\n", has_x86);
    printf("Has SSE2 0x%x\n", has_sse2);
    printf("Has SSSE3 0x%x\n", has_ssse3);
@ -204,6 +215,30 @@ int main(int argc, const char* argv[]) {
    printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
    printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
    printf("Has AMXINT8 0x%x\n", has_amxint8);
+    printf("Has AVX512BMM 0x%x\n", has_avx512bmm);
+
+#ifdef __linux__
+    // Probe VDPPHPS by executing it once with a temporary SIGILL handler.
+    {
+      struct sigaction act, oldact;
+      memset(&act, 0, sizeof(act));
+      act.sa_handler = vdpphps_sigill_handler;
+      sigaction(SIGILL, &act, &oldact);  // oldact keeps the prior disposition.
+
+      printf("Testing VDPPHPS instruction... ");
+      fflush(stdout);  // Emit the prompt before the instruction can trap.
+
+      if (sigsetjmp(vdpphps_jmpbuf, 1) == 0) {
+        // VDPPHPS xmm0, xmm0, xmm0: only whether it traps matters.
+        __asm__ volatile("vdpphps %%xmm0, %%xmm0, %%xmm0" : : : "xmm0");
+        printf("Works!\n");
+      } else {
+        printf("Crashed (SIGILL)!\n");  // Reached via siglongjmp.
+      }
+
+      sigaction(SIGILL, &oldact, NULL);  // Put the previous handler back.
+    }
+#endif
  }
 #endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
        // defined(_M_X64)
--- a/util/ssim.cc
+++ b/util/ssim.cc
@ -244,23 +244,23 @@ double GetSSIMFullKernel(const uint8_t* org,

 // Read 8 pixels at line #L, and convert to 16bit, perform weighting
 // and acccumulate.
-#define LOAD_LINE_PAIR(L, WEIGHT)                                              \
-  do {                                                                         \
-    const __m128i v0 =                                                         \
-        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
-    const __m128i v1 =                                                         \
-        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
-    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
-    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
-    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
-    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
-    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
-    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
-    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
-    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
-    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
-    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
-    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
+#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
+  do { /* do/while (0) makes the expansion a single statement */             \
+    const __m128i v0 = /* 8 bytes at line L of org */                        \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
+    const __m128i v1 = /* 8 bytes at line L of rec */                        \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
+    const __m128i w0 = _mm_unpacklo_epi8(v0, zero); /* widen to 16 bit */    \
+    const __m128i w1 = _mm_unpacklo_epi8(v1, zero); /* widen to 16 bit */    \
+    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
+    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
+    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); /* low 4 lanes */   \
+    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); /* low 4 lanes */   \
+    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); /* high 4 lanes */  \
+    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); /* high 4 lanes */  \
+    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); /* weight*org*org */    \
+    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); /* weight*org*rec */    \
+    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); /* weight*rec*rec */    \
   } while (0)

 #define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \