[libyuv] Remove all x86 SSE optimizations

Removed all SSE functions, macros, dispatching logic, and related
unit tests across the repository to reduce code size and complexity.
Left cpuid detection intact. Code paths for other instruction sets
(AVX2, NEON, SVE, etc.) are unaffected.

R=rrwinterton@gmail.com

Bug: None
Test: Build and run libyuv_unittest
Change-Id: Id19608dba35b79c4c8fc31f920a6a968883d300f
This commit is contained in:
Frank Barchard 2026-04-29 17:06:56 -07:00
parent f2ac6db694
commit 36e0fd216b
29 changed files with 2031 additions and 2357 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1934
Version: 1928
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -456,40 +456,6 @@ int ARGBToUYVY(const uint8_t* src_argb,
int width,
int height);
// RAW to NV21 with Matrix
LIBYUV_API
int RAWToNV21Matrix(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
const struct ArgbConstants* argbconstants,
int width,
int height);
// RAW to NV21
LIBYUV_API
int RAWToNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height);
// RGB24 to NV12
LIBYUV_API
int RGB24ToNV12(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// RAW to JNV21 full range NV21
LIBYUV_API
int RAWToJNV21(const uint8_t* src_raw,

View File

@ -487,9 +487,6 @@ int NV21ToNV12(const uint8_t* src_y,
int width,
int height);
// Alias
#define NV12ToNV21 NV21ToNV12
LIBYUV_API
int YUY2ToY(const uint8_t* src_yuy2,
int src_stride_yuy2,

View File

@ -140,13 +140,6 @@ extern "C" {
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || defined(__i386__) || \
defined(_M_X64) || defined(_M_X86))
#define HAS_ARGBTOUVMATRIXROW_AVX2
#define HAS_MERGEUVROW_AVX2
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
defined(GCC_HAS_AVX2))
@ -170,6 +163,7 @@ extern "C" {
#define HAS_I444TORGB24ROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV12TORGB24ROW_AVX2
@ -200,6 +194,7 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
(defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_ENABLE_ROWWIN)
#define HAS_RAWTOYJROW_SSSE3
#define HAS_AB64TOARGBROW_SSSE3
#define HAS_ABGRTOAR30ROW_SSSE3
#define HAS_ABGRTOYJROW_SSSE3
@ -250,9 +245,11 @@ extern "C" {
// TODO: port row_win to use 8 bit coefficients.
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYMATRIXROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_RGB24TOYJROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
// TODO: adjust row_win to use 8 bit negative coefficients.
@ -300,7 +297,6 @@ extern "C" {
#define HAS_ARGBTOUV444MATRIXROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOYMATRIXROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
@ -334,6 +330,8 @@ extern "C" {
#define HAS_P210TOARGBROW_AVX2
#define HAS_P410TOAR30ROW_AVX2
#define HAS_P410TOARGBROW_AVX2
#define HAS_RAWTOYJROW_AVX2
#define HAS_RGB24TOYJROW_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SPLITRGBROW_AVX2
@ -356,13 +354,7 @@ extern "C" {
defined(_M_X64) || defined(_M_X86)) && \
((defined(_MSC_VER) && !defined(__clang__)) || \
defined(LIBYUV_ENABLE_ROWWIN))
#define HAS_RAWTOARGBROW_AVX2
#if defined(__x86_64__) || defined(_M_X64)
#define HAS_RAWTOARGBROW_AVX512BW
#define HAS_RGB24TOARGBROW_AVX512BW
#endif
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOYMATRIXROW_AVX2
#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ABGRTOYJROW_AVX2
@ -378,10 +370,6 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) && \
!defined(LIBYUV_ENABLE_ROWWIN)
#define HAS_COPYROW_AVX512BW
#if defined(__x86_64__) || defined(_M_X64)
#define HAS_RAWTOARGBROW_AVX512BW
#define HAS_RGB24TOARGBROW_AVX512BW
#endif
#define HAS_ARGBTORGB24ROW_AVX512VBMI
#define HAS_CONVERT16TO8ROW_AVX512BW
#define HAS_MERGEUVROW_AVX512BW
@ -395,7 +383,6 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_AVX512BW
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
#define HAS_ARGBTOYROW_AVX512BW
#define HAS_ARGBTOYMATRIXROW_AVX512BW
#define HAS_ARGBTOUVJ444ROW_AVX512BW
#define HAS_ARGBTOUVROW_AVX512BW
#define HAS_ARGBTOUVJROW_AVX512BW
@ -433,7 +420,6 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_NEON
#define HAS_ARGBTOUVJ444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVMATRIXROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#if !defined(__aarch64__)
@ -496,9 +482,13 @@ extern "C" {
#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTOUVJROW_NEON
#define HAS_RAWTOUVROW_NEON
#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGB24TOUVJROW_NEON
#define HAS_RGB24TOUVROW_NEON
#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
@ -569,7 +559,6 @@ extern "C" {
#define HAS_ARGBSEPIAROW_NEON_DOTPROD
#define HAS_ARGBTOYJROW_NEON_DOTPROD
#define HAS_ARGBTOYROW_NEON_DOTPROD
#define HAS_ARGBTOYMATRIXROW_NEON_DOTPROD
#define HAS_BGRATOYROW_NEON_DOTPROD
#define HAS_RGBATOYJROW_NEON_DOTPROD
#define HAS_RGBATOYROW_NEON_DOTPROD
@ -580,7 +569,6 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_NEON_I8MM
#define HAS_ARGBTOUVJ444ROW_NEON_I8MM
#define HAS_ARGBTOUVJROW_NEON_I8MM
#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUVROW_NEON_I8MM
#define HAS_BGRATOUVROW_NEON_I8MM
#define HAS_RGBATOUVROW_NEON_I8MM
@ -596,7 +584,6 @@ extern "C" {
#define HAS_ARGBTORGB565DITHERROW_SVE2
#define HAS_ARGBTORGB565ROW_SVE2
#define HAS_ARGBTOUVJROW_SVE2
#define HAS_ARGBTOUVMATRIXROW_SVE2
#define HAS_ARGBTOUVROW_SVE2
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
@ -648,7 +635,6 @@ extern "C" {
#define HAS_ABGRTOUVROW_SME
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_ARGBTOUVJROW_SME
#define HAS_ARGBTOUVMATRIXROW_SME
#define HAS_ARGBTOUVROW_SME
#define HAS_BGRATOUVROW_SME
#define HAS_CONVERT16TO8ROW_SME
@ -757,8 +743,10 @@ extern "C" {
#define HAS_RAWTOARGBROW_LSX
#define HAS_RAWTORGB24ROW_LSX
#define HAS_RAWTOUVROW_LSX
#define HAS_RAWTOYROW_LSX
#define HAS_RGB24TOARGBROW_LSX
#define HAS_RGB24TOUVROW_LSX
#define HAS_RGB24TOYROW_LSX
#define HAS_RGB565TOARGBROW_LSX
#define HAS_RGB565TOUVROW_LSX
#define HAS_RGB565TOYROW_LSX
@ -778,9 +766,10 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_LSX
#define HAS_YUY2TOYROW_LSX
#define HAS_ARGBTOYROW_LSX
#define HAS_ARGBTOYMATRIXROW_LSX
#define HAS_ABGRTOYJROW_LSX
#define HAS_RGBATOYJROW_LSX
#define HAS_RGB24TOYJROW_LSX
#define HAS_RAWTOYJROW_LSX
#endif
#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
@ -813,7 +802,6 @@ extern "C" {
#define HAS_ARGBTOUVROW_LASX
#define HAS_ARGBTOYJROW_LASX
#define HAS_ARGBTOYROW_LASX
#define HAS_ARGBTOYMATRIXROW_LASX
#define HAS_ABGRTOYJROW_LASX
#define HAS_ABGRTOYROW_LASX
#define HAS_I422ALPHATOARGBROW_LASX
@ -832,8 +820,10 @@ extern "C" {
#define HAS_NV21TOARGBROW_LASX
#define HAS_RAWTOARGBROW_LASX
#define HAS_RAWTOUVROW_LASX
#define HAS_RAWTOYROW_LASX
#define HAS_RGB24TOARGBROW_LASX
#define HAS_RGB24TOUVROW_LASX
#define HAS_RGB24TOYROW_LASX
#define HAS_RGB565TOARGBROW_LASX
#define HAS_RGB565TOUVROW_LASX
#define HAS_RGB565TOYROW_LASX
@ -846,6 +836,8 @@ extern "C" {
#define HAS_RGBATOYROW_LASX
#define HAS_RGBATOYJROW_LASX
#define HAS_BGRATOYROW_LASX
#define HAS_RGB24TOYJROW_LASX
#define HAS_RAWTOYJROW_LASX
#endif
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
@ -875,6 +867,10 @@ extern "C" {
#define HAS_BGRATOYROW_RVV
#define HAS_COPYROW_RVV
#define HAS_INTERPOLATEROW_RVV
#define HAS_RAWTOYJROW_RVV
#define HAS_RAWTOYROW_RVV
#define HAS_RGB24TOYJROW_RVV
#define HAS_RGB24TOYROW_RVV
#define HAS_RGBATOYJROW_RVV
#define HAS_RGBATOYMATRIXROW_RVV
#define HAS_RGBATOYROW_RVV
@ -896,7 +892,8 @@ extern "C" {
// __riscv_vcreate_v_u8m2x3
// __riscv_vcreate_v_u8m2x4
// __riscv_vcreate_v_u8m4x2
#if defined(LIBYUV_RVV_HAS_VCREATE)
#if !defined(LIBYUV_RVV_HAS_TUPLE_TYPE) || \
(defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VCREATE))
#define HAS_AB64TOARGBROW_RVV
#define HAS_AR64TOAB64ROW_RVV
#define HAS_ARGBATTENUATEROW_RVV
@ -1779,6 +1776,12 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
@ -1844,43 +1847,6 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -2131,6 +2097,10 @@ void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
uint8_t* dst_y,
int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
@ -2141,19 +2111,31 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width);
void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
@ -2215,42 +2197,6 @@ void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb,
int width,
const struct ArgbConstants* c);
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -2305,6 +2251,10 @@ void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
@ -2324,6 +2274,14 @@ void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@ -2352,6 +2310,10 @@ void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@ -2365,21 +2327,29 @@ void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@ -4029,7 +3999,6 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@ -4121,9 +4090,6 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);

View File

@ -2019,7 +2019,7 @@ static const int8_t kABGRToUVJCoefficients[] = {
43, 85, -128, 0, -128, 107, 21, 0,
};
#define ARGBTOUVMATRIX_SVE \
#define ABCDTOUVMATRIX_SVE \
"ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
"ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
@ -2113,7 +2113,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb,
"ptrue p4.d \n"
"ptrue p5.h \n"
"1: \n" //
ARGBTOUVMATRIX_SVE
ABCDTOUVMATRIX_SVE
"b.gt 1b \n"
"2: \n"
@ -2126,7 +2126,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb,
"whilelt p3.d, %w[vl2], %w[width] \n"
"whilelt p4.d, %w[vl3], %w[width] \n"
"whilelt p5.h, wzr, %w[width] \n" //
ARGBTOUVMATRIX_SVE
ABCDTOUVMATRIX_SVE
"b.gt 3b \n"
"99: \n"

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1934
#define LIBYUV_VERSION 1928
#endif // INCLUDE_LIBYUV_VERSION_H_

BIN
psnr.o Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -3638,22 +3638,6 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
@ -3688,7 +3672,8 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
RGB24ToARGBRow = RGB24ToARGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) {
for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
src_rgb24 += src_stride_rgb24;
dst_argb += dst_stride_argb;
@ -3738,14 +3723,6 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;

View File

@ -199,70 +199,7 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
void (*ARGBToUV444MatrixRow)(const uint8_t* src_argb, uint8_t* dst_u,
uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUV444MatrixRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
ARGBToUV444MatrixRow_C;
#if defined(HAS_ARGBTOUV444MATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_SSSE3;
@ -287,6 +224,14 @@ ARGBToUV444MatrixRow_C;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUV444MATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_NEON;
@ -510,96 +455,7 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@ -615,6 +471,14 @@ ARGBToUVMatrixRow_C;
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
height == 0) {
@ -795,7 +659,7 @@ int ARGBToNV12(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -877,96 +741,7 @@ int ARGBToNV12Matrix(const uint8_t* src_argb,
void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@ -982,6 +757,14 @@ ARGBToUVMatrixRow_C;
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
@ -1006,7 +789,7 @@ ARGBToUVMatrixRow_C;
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -1240,7 +1023,7 @@ int ARGBToNV21(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -1460,7 +1243,7 @@ int ABGRToNV12(const uint8_t* src_abgr,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -1673,7 +1456,7 @@ int ABGRToNV21(const uint8_t* src_abgr,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -4117,93 +3900,41 @@ int ARGBToAB64(const uint8_t* src_argb,
return 0;
}
// Convert RAW to NV21 with Matrix.
// Enabled if 1 pass is available
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
#define HAS_RAWTOYJROW
#endif
// RAW to JNV21 full range NV21
LIBYUV_API
int RAWToNV21Matrix(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
const struct ArgbConstants* argbconstants,
int width,
int height) {
int RAWToJNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
#if defined(HAS_RAWTOYJROW)
void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
uint8_t* dst_uj, uint8_t* dst_vj, int width) =
RAWToUVJRow_C;
void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
RAWToYJRow_C;
#else
void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RAWToARGBRow_C;
void (*ARGBToUVMatrixRow)(const uint8_t* src_argb0, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUVMatrixRow_C;
void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
uint8_t* dst_uj, uint8_t* dst_vj, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYJRow_C;
#endif
void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj,
uint8_t* dst_vu, int width) = MergeUVRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
if (!src_raw || !dst_y || !dst_vu || !argbconstants || width <= 0 || height == 0) {
if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -4213,6 +3944,44 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
src_stride_raw = -src_stride_raw;
}
#if defined(HAS_RAWTOYJROW)
// Neon version does direct RAW to YUV.
#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVJRow = RAWToUVJRow_Any_NEON;
RAWToYJRow = RAWToYJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToYJRow = RAWToYJRow_NEON;
RAWToUVJRow = RAWToUVJRow_NEON;
}
}
#endif
#if defined(HAS_RAWTOYJROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
RAWToYJRow = RAWToYJRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
RAWToYJRow = RAWToYJRow_LSX;
}
}
#endif
#if defined(HAS_RAWTOYJROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
RAWToYJRow = RAWToYJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
RAWToYJRow = RAWToYJRow_LASX;
}
}
#endif
#if defined(HAS_RAWTOYJROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
RAWToYJRow = RAWToYJRow_RVV;
}
#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else // HAS_RAWTOYJROW
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@ -4229,99 +3998,47 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_NEON;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
RAWToARGBRow = RAWToARGBRow_SVE2;
}
#endif
#if defined(HAS_RAWTOARGBROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
RAWToARGBRow = RAWToARGBRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_LSX;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
RAWToARGBRow = RAWToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_LASX;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
RAWToARGBRow = RAWToARGBRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToUVJRow = ARGBToUVJRow_AVX512BW;
}
}
#endif
#endif // HAS_RAWTOYJROW
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow = MergeUVRow_Any_SSE2;
@ -4333,7 +4050,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -4372,86 +4089,58 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
MergeUVRow = MergeUVRow_RVV;
}
#endif
{
// Allocate 2 rows of ARGB.
const int row_size = (width * 4 + 31) & ~31;
align_buffer_64(row, row_size * 2);
// Allocate 1 row of U and 1 row of V.
align_buffer_64(row_u, halfwidth);
align_buffer_64(row_v, halfwidth);
if (!row || !row_u || !row_v) {
free_aligned_buffer_64(row);
free_aligned_buffer_64(row_u);
free_aligned_buffer_64(row_v);
#if defined(HAS_RAWTOYJROW)
// Allocate a row of uv.
const int row_uv_size = ((halfwidth + 31) & ~31);
align_buffer_64(row_uj, row_uv_size * 2);
uint8_t* row_vj = row_uj + row_uv_size;
#else
// Allocate row of uv and 2 rows of ARGB.
const int row_size = ((width * 4 + 31) & ~31);
const int row_uv_size = ((halfwidth + 31) & ~31);
align_buffer_64(row_uj, row_uv_size * 2 + row_size * 2);
uint8_t* row_vj = row_uj + row_uv_size;
uint8_t* row = row_vj + row_uv_size;
#endif
if (!row_uj)
return 1;
}
for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RAWTOYJROW)
RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
RAWToYJRow(src_raw, dst_y, width);
RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
#else
RAWToARGBRow(src_raw, row, width);
RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
ARGBToUVMatrixRow(row, row_size, row_u, row_v, width, argbconstants);
MergeUVRow(row_v, row_u, dst_vu, halfwidth);
ARGBToYMatrixRow(row, dst_y, width, argbconstants);
ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants);
ARGBToUVJRow(row, row_size, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
ARGBToYJRow(row, dst_y, width);
ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
#endif
src_raw += src_stride_raw * 2;
dst_y += dst_stride_y * 2;
dst_vu += dst_stride_vu;
}
if (height & 1) {
#if defined(HAS_RAWTOYJROW)
RAWToUVJRow(src_raw, 0, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
RAWToYJRow(src_raw, dst_y, width);
#else
RAWToARGBRow(src_raw, row, width);
ARGBToUVMatrixRow(row, 0, row_u, row_v, width, argbconstants);
MergeUVRow(row_v, row_u, dst_vu, halfwidth);
ARGBToYMatrixRow(row, dst_y, width, argbconstants);
ARGBToUVJRow(row, 0, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
ARGBToYJRow(row, dst_y, width);
#endif
}
free_aligned_buffer_64(row_v);
free_aligned_buffer_64(row_u);
free_aligned_buffer_64(row);
free_aligned_buffer_64(row_uj);
}
return 0;
}
LIBYUV_API
int RAWToJNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height) {
return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu,
dst_stride_vu, &kArgbJPEGConstants, width, height);
}
LIBYUV_API
int RAWToNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height) {
return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu,
dst_stride_vu, &kArgbI601Constants, width, height);
}
LIBYUV_API
int RGB24ToNV12(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
return RAWToNV21Matrix(src_rgb24, src_stride_rgb24, dst_y, dst_stride_y,
dst_uv, dst_stride_uv, &kAbgrI601Constants, width,
height);
}
#undef HAS_RAWTOYJROW
#ifdef __cplusplus
} // extern "C"

View File

@ -693,7 +693,7 @@ void MergeUVPlane(const uint8_t* src_u,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}

View File

@ -101,11 +101,11 @@ void TransposeWx8_SSSE3(const uint8_t* src,
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -243,11 +243,11 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
"xmm15");
@ -356,13 +356,13 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(src_stride)), // %4
"r"((ptrdiff_t)(dst_stride_a)), // %5
"r"((ptrdiff_t)(dst_stride_b)) // %6
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9");
}

View File

@ -616,7 +616,7 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX512BW
ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
@ -1000,12 +1000,6 @@ ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#if defined(HAS_RAWTOARGBROW_AVX2)
ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31)
#endif
#if defined(HAS_RAWTOARGBROW_AVX512BW)
ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63)
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63)
#endif
#if defined(HAS_RAWTORGBAROW_SSSE3)
ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
#endif
@ -1206,36 +1200,52 @@ ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15)
ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31)
#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_AVX2
ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB24TOYJROW_SSSE3
ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_NEON
ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYROW_LSX
ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_LSX
ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_LASX
ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB24TOYROW_LASX
ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYJROW_AVX2
ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYJROW_SSSE3
ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYJROW_NEON
ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYROW_LSX
ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYROW_LASX
ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYJROW_LSX
ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYJROW_LASX
ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15)
@ -2264,12 +2274,6 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \
}
#ifdef HAS_ARGBTOUVMATRIXROW_NEON
ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
#endif
@ -2320,18 +2324,6 @@ ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63)
#ifdef HAS_ARGBTOYMATRIXROW_NEON
ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_NEON_DOTPROD
ANY11MC(ARGBToYMatrixRow_Any_NEON_DotProd, ARGBToYMatrixRow_NEON_DotProd, 4, 15)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_LSX
ANY11MC(ARGBToYMatrixRow_Any_LSX, ARGBToYMatrixRow_LSX, 4, 15)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_LASX
ANY11MC(ARGBToYMatrixRow_Any_LASX, ARGBToYMatrixRow_LASX, 4, 31)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_RVV
ANY11MC(ARGBToYMatrixRow_Any_RVV, ARGBToYMatrixRow_RVV, 4, 15)
#endif
#undef ANY11MC
#ifdef HAS_ARGBTOUVROW_AVX2

View File

@ -678,6 +678,8 @@ MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
MAKEROWY(ABGR, 0, 1, 2, 4)
MAKEROWY(RGBA, 3, 2, 1, 4)
MAKEROWY(RGB24, 2, 1, 0, 3)
MAKEROWY(RAW, 0, 1, 2, 3)
#undef MAKEROWY
// JPeg uses BT.601-1 full range
@ -751,6 +753,8 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(ABGR, 0, 1, 2, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
MAKEROWYJ(RGB24, 2, 1, 0, 3)
MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
static __inline uint8_t RGBToYMatrix(uint8_t r,
@ -4375,21 +4379,69 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
#ifdef HAS_RGB24TOYJROW_AVX2
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
ARGBToYJRow_AVX2(row, dst_yj, twidth);
src_rgb24 += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RGB24TOYJROW_AVX2
#ifdef HAS_RAWTOYJROW_AVX2
// Convert 32 RAW pixels (128 bytes) to 32 YJ values.
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
#ifdef HAS_RAWTOARGBROW_AVX2
RAWToARGBRow_AVX2(src_raw, row, twidth);
#else
RAWToARGBRow_SSSE3(src_raw, row, twidth);
#endif
ARGBToYJRow_AVX2(row, dst_yj, twidth);
src_raw += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RAWTOYJROW_AVX2
#ifdef HAS_RGB24TOYJROW_SSSE3
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
ARGBToYJRow_SSSE3(row, dst_yj, twidth);
src_rgb24 += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RGB24TOYJROW_SSSE3
#ifdef HAS_RAWTOYJROW_SSSE3
// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
RAWToARGBRow_SSSE3(src_raw, row, twidth);
ARGBToYJRow_SSSE3(row, dst_yj, twidth);
src_raw += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RAWTOYJROW_SSSE3

View File

@ -262,64 +262,6 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#ifdef HAS_RAWTOARGBROW_AVX512BW
static const uint32_t kPermdRAWToARGB_AVX512BW[16] = {
0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) {
asm volatile(
"vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff
"vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000
"movabs $0xffffffffffff,%%rax \n" // 48 bytes mask
"kmovq %%rax,%%k1 \n"
"vmovdqu32 %3,%%zmm5 \n"
"vbroadcasti32x4 %4,%%zmm4 \n"
LABELALIGN //
"1: \n"
"vmovdqu8 (%0),%%zmm0%{%%k1%}%{z%} \n"
"vmovdqu8 48(%0),%%zmm1%{%%k1%}%{z%} \n"
"vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n"
"vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n"
"lea 192(%0),%0 \n"
"vpermd %%zmm0,%%zmm5,%%zmm0 \n"
"vpermd %%zmm1,%%zmm5,%%zmm1 \n"
"vpermd %%zmm2,%%zmm5,%%zmm2 \n"
"vpermd %%zmm3,%%zmm5,%%zmm3 \n"
"vpshufb %%zmm4,%%zmm0,%%zmm0 \n"
"vpshufb %%zmm4,%%zmm1,%%zmm1 \n"
"vpshufb %%zmm4,%%zmm2,%%zmm2 \n"
"vpshufb %%zmm4,%%zmm3,%%zmm3 \n"
"vpord %%zmm6,%%zmm0,%%zmm0 \n"
"vpord %%zmm6,%%zmm1,%%zmm1 \n"
"vpord %%zmm6,%%zmm2,%%zmm2 \n"
"vpord %%zmm6,%%zmm3,%%zmm3 \n"
"vmovdqu32 %%zmm0,(%1) \n"
"vmovdqu32 %%zmm1,0x40(%1) \n"
"vmovdqu32 %%zmm2,0x80(%1) \n"
"vmovdqu32 %%zmm3,0xc0(%1) \n"
"lea 0x100(%1),%1 \n"
"sub $0x40,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kPermdRAWToARGB_AVX512BW), // %3
"m"(*shuffler) // %4
: "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6");
}
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width);
}
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width);
}
#endif
// Same code as RAWToARGB with different shuffler and A in low bits
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
@ -1913,9 +1855,9 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
#else
"+rm"(width) // %3
#endif
: "r"((ptrdiff_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "r"((intptr_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1988,9 +1930,9 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
#else
"+rm"(width) // %3
#endif
: "r"((ptrdiff_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "r"((intptr_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2293,11 +2235,11 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
#else
"+rm"(width) // %3
#endif
: "r"((ptrdiff_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB), // %6
"m"(kPermdARGBToY_AVX512BW), // %7
"m"(kPermdARGBToUV_AVX512BW) // %8
: "r"((intptr_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB), // %6
"m"(kPermdARGBToY_AVX512BW), // %7
"m"(kPermdARGBToUV_AVX512BW) // %8
: "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
"zmm7", "zmm16", "zmm17", "zmm18", "zmm19");
}
@ -4649,7 +4591,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("movdqa %3,%%xmm5 \n"
LABELALIGN
@ -4670,7 +4612,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
#ifdef HAS_MIRRORROW_AVX2
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
@ -4697,7 +4639,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("movdqa %3,%%xmm5 \n"
LABELALIGN
@ -4718,7 +4660,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
#ifdef HAS_MIRRORUVROW_AVX2
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
@ -4747,7 +4689,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"movdqa %4,%%xmm1 \n"
"lea -0x10(%0,%3,2),%0 \n"
@ -4786,7 +4728,7 @@ static const uvec8 kShuffleMirrorRGB1 = {
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
src_rgb24 += width * 3 - 48;
asm volatile(
"movdqa %3,%%xmm4 \n"
@ -4822,7 +4764,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
#ifdef HAS_ARGBMIRRORROW_SSE2
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
@ -4846,7 +4788,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("vmovdqu %3,%%ymm5 \n"
LABELALIGN
@ -6867,10 +6809,10 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(stride_yuy2)) // %3
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((intptr_t)(stride_yuy2)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
@ -6906,11 +6848,11 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_yuy2)) // %4
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
@ -7001,11 +6943,11 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_uyvy)) // %4
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
@ -7092,10 +7034,10 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(stride_yuy2)) // %3
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((intptr_t)(stride_yuy2)) // %3
: "memory", "cc", "xmm0", "xmm1");
}
@ -7132,11 +7074,11 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_yuy2)) // %4
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
@ -7232,11 +7174,11 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_uyvy)) // %4
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
@ -8596,12 +8538,12 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
"+r"(dst), // %2
"+rm"(count) // %3
: "r"((ptrdiff_t)(width)), // %4
"rm"(area) // %5
: "+r"(topleft), // %0
"+r"(botleft), // %1
"+r"(dst), // %2
"+rm"(count) // %3
: "r"((intptr_t)(width)), // %4
"rm"(area) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@ -8614,7 +8556,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
const float* src_dudv,
int width) {
ptrdiff_t src_argb_stride_temp = src_argb_stride;
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile(
"movq (%3),%%xmm2 \n"
@ -8766,11 +8708,11 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr,
"jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+rm"(width), // %2
"+r"(source_y_fraction) // %3
: "r"(src_stride) // %4
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+rm"(width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_INTERPOLATEROW_SSSE3
@ -8844,11 +8786,11 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
"99: \n"
"vzeroupper \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(width), // %2
"+r"(source_y_fraction) // %3
: "r"(src_stride) // %4
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
#endif // HAS_INTERPOLATEROW_AVX2
@ -9678,12 +9620,12 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
"lea 0x10(%2),%2 \n"
"sub $0x10,%3 \n" // 16 src pixels per loop
"jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(src_stride_u)), // %4
"r"((ptrdiff_t)(src_stride_v)) // %5
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride_u)), // %4
"r"((intptr_t)(src_stride_v)) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
@ -9724,12 +9666,12 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
"sub $0x20,%3 \n" // 32 src pixels per loop
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(src_stride_u)), // %4
"r"((ptrdiff_t)(src_stride_v)) // %5
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride_u)), // %4
"r"((intptr_t)(src_stride_v)) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

View File

@ -2013,24 +2013,24 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y,
}
}
#ifndef ArgbConstants
struct ArgbConstants {
#ifndef RgbConstants
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;
uint16_t pad;
};
#define ArgbConstants ArgbConstants
#define RgbConstants RgbConstants
// RGB to JPeg coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
128,
0};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
@ -2038,20 +2038,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
0x1080,
0};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
0x1080,
0};
#endif // ArgbConstants
#endif // RgbConstants
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
@ -2088,7 +2088,7 @@ void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
: "+&r"(src_argb), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), "r"(shuff)
: "r"(rgbconstants), "r"(shuff)
: "memory");
}
@ -2113,7 +2113,7 @@ void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
@ -2150,7 +2150,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), "r"(shuff)
: "r"(rgbconstants), "r"(shuff)
: "memory");
}
@ -2169,7 +2169,7 @@ void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int8_t shuff[128] = {
0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
@ -2219,14 +2219,26 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), // %3
: "r"(rgbconstants), // %3
"r"(shuff) // %4
: "memory");
}
void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants);
}
void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants);
}
void ARGBToUVJRow_LASX(const uint8_t* src_argb,
int src_stride_argb,

View File

@ -2798,24 +2798,24 @@ void HalfFloatRow_LSX(const uint16_t* src,
}
}
#ifndef ArgbConstants
struct ArgbConstants {
#ifndef RgbConstants
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;
uint16_t pad;
};
#define ArgbConstants ArgbConstants
#define RgbConstants RgbConstants
// RGB to JPeg coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
128,
0};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
@ -2823,20 +2823,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
0x1080,
0};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
0x1080,
0};
#endif // ArgbConstants
#endif // RgbConstants
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
@ -2870,7 +2870,7 @@ void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
: "+&r"(src_argb), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c)
: "r"(rgbconstants)
: "memory");
}
@ -2895,7 +2895,7 @@ void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
@ -2929,7 +2929,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c)
: "r"(rgbconstants)
: "memory");
}
@ -2948,7 +2948,7 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6,
7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
@ -2990,14 +2990,26 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), // %3
: "r"(rgbconstants), // %3
"r"(shuff) // %4
: "memory");
}
void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants);
}
void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants);
}
// undef for unified sources build
#undef YUVTORGB_SETUP

View File

@ -1918,72 +1918,6 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vld1.8 {d18}, [%5] \n" // load kRGBToU
"vld1.8 {d19}, [%6] \n" // load kRGBToV
"vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17)
"vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19)
"vdup.16 q10, d16[0] \n" // U0
"vdup.16 q11, d16[1] \n" // U1
"vdup.16 q12, d16[2] \n" // U2
"vdup.16 q13, d18[0] \n" // V0
"vdup.16 q14, d18[1] \n" // V1
"vdup.16 q15, d18[2] \n" // V2
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n"
"vmov.u16 q3, #0x8000 \n" // 128.0
"vmul.s16 q8, q0, q10 \n" // U = B * U0
"vmla.s16 q8, q1, q11 \n" // U += G * U1
"vmla.s16 q8, q2, q12 \n" // U += R * U2
"vmul.s16 q9, q0, q13 \n" // V = B * V0
"vmla.s16 q9, q1, q14 \n" // V += G * V1
"vmla.s16 q9, q2, q15 \n" // V += R * V2
"vsub.u16 q8, q3, q8 \n" // 128.0 - U
"vsub.u16 q9, q3, q9 \n" // 128.0 - V
"vqshrn.u16 d0, q8, #8 \n" // Saturating shift right
"vqshrn.u16 d1, q9, #8 \n"
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
: "r"(&c->kRGBToU), // %5
"r"(&c->kRGBToV) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -2896,7 +2830,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
}
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
@ -2931,9 +2865,21 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
"q12");
}
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants);
}
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants);
}
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants);
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants);
}
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
#include "libyuv/convert_from_argb.h"
#ifdef __cplusplus
namespace libyuv {
@ -2894,26 +2893,14 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"ldr q16, [%[c], #16] \n" // kRGBToU
"ldr q17, [%[c], #32] \n" // kRGBToV
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
"dup v20.8h, v16.h[0] \n" // U0 (-BU)
"dup v21.8h, v16.h[1] \n" // U1 (-GU)
"dup v22.8h, v16.h[2] \n" // U2 (-RU)
"dup v23.8h, v17.h[0] \n" // V0 (-BV)
"dup v24.8h, v17.h[1] \n" // V1 (-GV)
"dup v26.8h, v17.h[2] \n" // V2 (-RV)
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
@ -2922,7 +2909,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@ -2932,20 +2919,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
// U = B*U0 + G*U1 + R*U2
"mul v3.8h, v0.8h, v20.8h \n"
"mla v3.8h, v1.8h, v21.8h \n"
"mla v3.8h, v2.8h, v22.8h \n"
// V = B*V0 + G*V1 + R*V2
"mul v4.8h, v0.8h, v23.8h \n"
"mla v4.8h, v1.8h, v24.8h \n"
"mla v4.8h, v2.8h, v26.8h \n"
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
"subhn v0.8b, v25.8h, v3.8h \n"
"subhn v1.8b, v25.8h, v4.8h \n"
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
@ -2954,21 +2928,12 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
: [c] "r"(c) // %5
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
&kArgbI601Constants);
}
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -3484,7 +3449,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
}
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
static void ABCDToUVMatrixRow_NEON_I8MM(const uint8_t* src,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
@ -3581,25 +3546,12 @@ static const int8_t kRGBAToUVCoefficients[] = {
0, -112, 74, 38, 0, 18, 94, -112,
};
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
int8_t uvconstants[8] = {
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
uvconstants);
}
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
}
@ -3608,7 +3560,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
}
@ -3617,7 +3569,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
}
@ -3626,7 +3578,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
}
@ -3654,7 +3606,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients);
}
@ -3663,7 +3615,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVJCoefficients);
}
@ -3763,20 +3715,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
}
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;
};
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"ldr d0, [%3] \n" // load rgbconstants
"dup v6.16b, v0.b[0] \n"
"dup v7.16b, v0.b[1] \n"
"dup v16.16b, v0.b[2] \n"
"dup v17.8h, v1.h[0] \n"
"dup v17.8h, v0.h[2] \n"
"1: \n"
"ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
// pixels.
@ -3795,21 +3749,20 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
void ARGBToYMatrixRow_NEON_DotProd(
static void ARGBToYMatrixRow_NEON_DotProd(
const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"ldr d0, [%3] \n" // load rgbconstants
"dup v16.4s, v0.s[0] \n"
"dup v17.8h, v1.h[0] \n"
"dup v17.8h, v0.h[2] \n"
"1: \n"
"ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16
// pixels.
@ -3831,7 +3784,7 @@ void ARGBToYMatrixRow_NEON_DotProd(
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
@ -3841,10 +3794,12 @@ void ARGBToYMatrixRow_NEON_DotProd(
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
0x0080};
static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77},
0x0080};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
@ -3852,11 +3807,14 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {},
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
0x1080};
static const struct RgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66},
0x1080};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
static const struct RgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25},
0x1080};
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
@ -3903,14 +3861,13 @@ void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"ldr d0, [%3] \n" // load rgbconstants
"dup v6.16b, v0.b[0] \n"
"dup v7.16b, v0.b[1] \n"
"dup v16.16b, v0.b[2] \n"
"dup v17.8h, v1.h[0] \n"
"dup v17.8h, v0.h[2] \n"
"1: \n"
"ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
// pixels.
@ -3929,7 +3886,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
@ -3973,10 +3930,10 @@ void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
&kRawI601DotProdConstants);
}
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr d0, [%3] \n" // load rgbconstants
"dup v5.16b, v0.b[0] \n"
@ -4000,13 +3957,25 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
: "+r"(src_rgb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
}
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
}
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,

File diff suppressed because it is too large Load Diff

View File

@ -1120,20 +1120,6 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
}
__arm_locally_streaming void ARGBToUVMatrixRow_SME(
const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
int8_t uvconstants[8] = {
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
uvconstants);
}
__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -217,19 +217,6 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
int8_t uvconstants[8] = {
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
uvconstants);
}
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -122,10 +122,8 @@ extern "C" {
#if defined(__clang__) || defined(__GNUC__)
#define LIBYUV_TARGET_AVX2 __attribute__((target("avx2")))
#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f")))
#else
#define LIBYUV_TARGET_AVX2
#define LIBYUV_TARGET_AVX512BW
#endif
LIBYUV_TARGET_AVX2
@ -212,197 +210,6 @@ LIBYUV_TARGET_AVX2
void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX2(src_bgra, dst_y, width, &kBgraI601Constants);
}
#ifdef HAS_RAWTOARGBROW_AVX2
LIBYUV_TARGET_AVX2
void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
__m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
__m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
__m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
__m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low);
__m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high);
while (width > 0) {
__m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw);
__m256i ymm0 = _mm256_castsi128_si256(xmm0);
ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
__m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24));
__m256i ymm1 = _mm256_castsi128_si256(xmm1);
ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);
__m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48));
__m256i ymm2 = _mm256_castsi128_si256(xmm2);
ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
__m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68));
__m256i ymm3 = _mm256_castsi128_si256(xmm3);
ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf2);
ymm0 = _mm256_or_si256(ymm0, ymm_alpha);
ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
ymm3 = _mm256_or_si256(ymm3, ymm_alpha);
_mm256_storeu_si256((__m256i*)dst_argb, ymm0);
_mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
_mm256_storeu_si256((__m256i*)(dst_argb + 64), ymm2);
_mm256_storeu_si256((__m256i*)(dst_argb + 96), ymm3);
src_raw += 96;
dst_argb += 128;
width -= 32;
}
}
#endif
#ifdef HAS_RAWTOARGBROW_AVX512BW
LIBYUV_TARGET_AVX512BW
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) {
__m512i zmm_alpha = _mm512_set1_epi32(0xff000000);
__m512i zmm_perm = _mm512_set_epi32(
12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
__m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler));
while (width > 0) {
__m512i zmm0 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw);
__m512i zmm1 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 48);
__m512i zmm2 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 96);
__m512i zmm3 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 144);
zmm0 = _mm512_permutexvar_epi32(zmm_perm, zmm0);
zmm1 = _mm512_permutexvar_epi32(zmm_perm, zmm1);
zmm2 = _mm512_permutexvar_epi32(zmm_perm, zmm2);
zmm3 = _mm512_permutexvar_epi32(zmm_perm, zmm3);
zmm0 = _mm512_shuffle_epi8(zmm0, zmm_shuf);
zmm1 = _mm512_shuffle_epi8(zmm1, zmm_shuf);
zmm2 = _mm512_shuffle_epi8(zmm2, zmm_shuf);
zmm3 = _mm512_shuffle_epi8(zmm3, zmm_shuf);
zmm0 = _mm512_or_si512(zmm0, zmm_alpha);
zmm1 = _mm512_or_si512(zmm1, zmm_alpha);
zmm2 = _mm512_or_si512(zmm2, zmm_alpha);
zmm3 = _mm512_or_si512(zmm3, zmm_alpha);
_mm512_storeu_si512(dst_argb, zmm0);
_mm512_storeu_si512(dst_argb + 64, zmm1);
_mm512_storeu_si512(dst_argb + 128, zmm2);
_mm512_storeu_si512(dst_argb + 192, zmm3);
src_raw += 192;
dst_argb += 256;
width -= 64;
}
}
LIBYUV_TARGET_AVX512BW
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
__m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width);
}
LIBYUV_TARGET_AVX512BW
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
__m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width);
}
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
__m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
__m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
__m256i ymm_0101 = _mm256_set1_epi16(0x0101);
__m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
__m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
__m256i ymm_zero = _mm256_setzero_si256();
while (width > 0) {
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
__m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
__m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
__m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf);
ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101);
ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101);
ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101);
ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101);
ymm0 = _mm256_add_epi16(ymm0, ymm2);
ymm1 = _mm256_add_epi16(ymm1, ymm3);
ymm0 = _mm256_srli_epi16(ymm0, 1);
ymm1 = _mm256_srli_epi16(ymm1, 1);
ymm0 = _mm256_avg_epu16(ymm0, ymm_zero);
ymm1 = _mm256_avg_epu16(ymm1, ymm_zero);
ymm0 = _mm256_packus_epi16(ymm0, ymm1);
ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v);
ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u);
ymm0 = _mm256_hadd_epi16(ymm0, ymm1);
ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
ymm0 = _mm256_sub_epi16(ymm_8000, ymm0);
ymm0 = _mm256_srli_epi16(ymm0, 8);
ymm0 = _mm256_packus_epi16(ymm0, ymm0);
__m128i xmm_u = _mm256_castsi256_si128(ymm0);
__m128i xmm_v = _mm256_extracti128_si256(ymm0, 1);
_mm_storel_epi64((__m128i*)dst_u, xmm_u);
_mm_storel_epi64((__m128i*)dst_v, xmm_v);
src_argb += 64;
dst_u += 8;
dst_v += 8;
width -= 16;
}
}
#endif
#ifdef HAS_MERGEUVROW_AVX2
LIBYUV_TARGET_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
while (width > 0) {
__m256i ymm0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
__m256i ymm1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
ymm1 = _mm256_slli_epi16(ymm1, 8);
ymm0 = _mm256_or_si256(ymm0, ymm1);
_mm256_storeu_si256((__m256i*)dst_uv, ymm0);
src_u += 16;
src_v += 16;
dst_uv += 32;
width -= 16;
}
}
#endif
#endif

View File

@ -362,35 +362,36 @@ void ScaleRowDown4Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >>
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
@ -399,35 +400,36 @@ void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >>
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
@ -890,26 +892,27 @@ void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536 / 6) >>
16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -919,26 +922,27 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
(65536u / 9u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
(65536u / 9u) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536u / 6u) >>
16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536u / 9u) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536u / 9u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536u / 6u) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -949,23 +953,22 @@ void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] +
src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536 / 4) >>
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -975,23 +978,22 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] +
src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) *
(65536u / 6u) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) *
(65536u / 6u) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536u / 4u) >>
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536u / 6u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536u / 6u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536u / 4u) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -1687,7 +1689,7 @@ void ScalePlaneVertical(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride,
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_bytes, yf);
dst_argb += dst_stride;
y += dy;
@ -1763,7 +1765,7 @@ void ScalePlaneVertical_16(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride,
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
@ -1832,8 +1834,8 @@ void ScalePlaneVertical_16To8(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow_16To8(dst_argb, src_argb + yi * (ptrdiff_t)src_stride,
src_stride, scale, dst_width_words, yf);
InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride,
scale, dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
}

View File

@ -183,10 +183,10 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
@ -283,10 +283,10 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEROWDOWN2_AVX2
@ -326,7 +326,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
ptrdiff_t stridex3;
intptr_t stridex3;
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"pabsw %%xmm4,%%xmm5 \n"
@ -367,11 +367,11 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
"lea 0x8(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"=&r"(stridex3) // %3
: "r"(src_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"=&r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
@ -456,11 +456,11 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(src_stride * 3) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(src_stride * 3)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEROWDOWN4_AVX2
@ -557,11 +557,11 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kMadd21) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -625,11 +625,11 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kMadd21) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -701,10 +701,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
@ -762,10 +762,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -935,11 +935,11 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1084,12 +1084,12 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearShuffleFar) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearShuffleFar) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1246,11 +1246,11 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
"lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1371,12 +1371,12 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1497,12 +1497,12 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1612,12 +1612,12 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearShuffleFar) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearShuffleFar) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
@ -1746,11 +1746,11 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
@ -2016,10 +2016,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
@ -2030,8 +2030,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx);
ptrdiff_t src_stepx_x12;
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
"lea 0x00(,%1,4),%1 \n"
@ -2067,8 +2067,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx);
ptrdiff_t src_stepx_x12;
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
@ -2101,7 +2102,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
"+r"(dst_argb), // %2
"+rm"(dst_width), // %3
"=&r"(src_stepx_x12), // %4
"+r"(src_stride) // %5
"+r"(row1) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
@ -2363,12 +2364,12 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
"lea 0x8(%1),%1 \n" // 4 UV
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
@ -2404,12 +2405,12 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
@ -2530,12 +2531,12 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kUVLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2654,12 +2655,12 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kUVLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2798,11 +2799,11 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
"lea 0x10(%1),%1 \n" // 2 uv to 4 uv
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2929,11 +2930,11 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

View File

@ -2827,8 +2827,9 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
int has_large_malloc = 1;
#endif
if (!has_large_malloc) {
GTEST_SKIP() << "WARNING: Large allocation may assert for "
<< (size_t)kWidth * kHeight << " bytes";
printf("WARNING: Skipped. Large allocation may assert for %zd\n",
(size_t)kWidth * kHeight);
return;
}
// Allocate one extra column so that the coalesce optimizations do not trigger
@ -2840,16 +2841,20 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
fflush(stdout);
align_buffer_page_end(orig_i400, (size_t)kWidth * kHeight);
if (!orig_i400) {
GTEST_SKIP() << "WARNING: unable to allocate I400 image of "
<< (size_t)kWidth * kHeight << " bytes";
printf("WARNING: unable to allocate I400 image of %zd bytes\n",
(size_t)kWidth * kHeight);
fflush(stdout);
return;
}
printf("INFO: allocate I400 image returned %p\n", orig_i400);
fflush(stdout);
align_buffer_page_end(dest_argb, (size_t)kWidth * kHeight * 4);
if (!dest_argb) {
printf("WARNING: unable to allocate ARGB image of %zd bytes\n",
(size_t)kWidth * kHeight * 4);
fflush(stdout);
free_aligned_buffer_page_end(orig_i400);
GTEST_SKIP() << "WARNING: unable to allocate ARGB image of "
<< (size_t)kWidth * kHeight * 4 << " bytes";
return;
}
printf("INFO: allocate ARGB image returned %p\n", dest_argb);
fflush(stdout);
@ -2867,72 +2872,4 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
#endif // !defined(LEAN_TESTS)
#define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
const int kWidth = W1280; \
const int kHeight = benchmark_height_; \
const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
const int kStrideA = \
(kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
const int kStrideY = kWidth; \
const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \
const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
align_buffer_page_end(src_argb, \
kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
align_buffer_page_end(dst_y_c, kStrideY* kHeight); \
align_buffer_page_end(dst_uv_c, kSizeUV); \
align_buffer_page_end(dst_y_opt, kStrideY* kHeight); \
align_buffer_page_end(dst_uv_opt, kSizeUV); \
for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
src_argb[i + OFF] = (fastrand() & 0xff); \
} \
memset(dst_y_c, 1, kStrideY* kHeight); \
memset(dst_uv_c, 2, kSizeUV); \
memset(dst_y_opt, 101, kStrideY* kHeight); \
memset(dst_uv_opt, 102, kSizeUV); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY, \
dst_uv_c, kStrideUV, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_opt, \
kStrideY, dst_uv_opt, kStrideUV, kWidth, NEG kHeight); \
} \
for (int i = 0; i < kStrideY * kHeight; ++i) { \
EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
} \
for (int i = 0; i < kSizeUV; ++i) { \
EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \
} \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
}
#if defined(ENABLE_FULL_TESTS)
#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Unaligned, +, 4) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
#else
#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
#endif
TESTATOBP(RAW, uint8_t, 3, 3, 1, NV21, 2, 2)
TESTATOBP(RGB24, uint8_t, 3, 3, 1, NV12, 2, 2)
TESTATOBP(RAW, uint8_t, 3, 3, 1, JNV21, 2, 2)
} // namespace libyuv

View File

@ -825,6 +825,7 @@ TESTATOBP(ARGB, 1, 4, NV12, 2, 2)
TESTATOBP(ARGB, 1, 4, NV21, 2, 2)
TESTATOBP(ABGR, 1, 4, NV12, 2, 2)
TESTATOBP(ABGR, 1, 4, NV21, 2, 2)
TESTATOBP(RAW, 1, 3, JNV21, 2, 2)
TESTATOBP(YUY2, 2, 4, NV12, 2, 2)
TESTATOBP(UYVY, 2, 4, NV12, 2, 2)
TESTATOBP(AYUV, 1, 4, NV12, 2, 2)

View File

@ -892,11 +892,6 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#endif
{
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,

View File

@ -8,14 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <new>
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
@ -43,95 +38,6 @@
namespace libyuv {
#ifdef ENABLE_ROW_TESTS
#ifdef HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
SIMD_ALIGNED(uint8_t dst_pixels_opt[64]);
SIMD_ALIGNED(uint8_t dst_pixels_c[64]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (!has_ssse3) {
printf("Warning SSSE3 not detected; Skipping test.\n");
} else {
// TL.
orig_pixels[0] = 255u;
orig_pixels[1] = 0u;
orig_pixels[128 + 0] = 0u;
orig_pixels[128 + 1] = 0u;
// TR.
orig_pixels[2] = 0u;
orig_pixels[3] = 100u;
orig_pixels[128 + 2] = 0u;
orig_pixels[128 + 3] = 0u;
// BL.
orig_pixels[4] = 0u;
orig_pixels[5] = 0u;
orig_pixels[128 + 4] = 50u;
orig_pixels[128 + 5] = 0u;
// BR.
orig_pixels[6] = 0u;
orig_pixels[7] = 0u;
orig_pixels[128 + 6] = 0u;
orig_pixels[128 + 7] = 20u;
// Odd.
orig_pixels[126] = 4u;
orig_pixels[127] = 255u;
orig_pixels[128 + 126] = 16u;
orig_pixels[128 + 127] = 255u;
// Test regular half size.
ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
EXPECT_EQ(64u, dst_pixels_c[0]);
EXPECT_EQ(25u, dst_pixels_c[1]);
EXPECT_EQ(13u, dst_pixels_c[2]);
EXPECT_EQ(5u, dst_pixels_c[3]);
EXPECT_EQ(0u, dst_pixels_c[4]);
EXPECT_EQ(133u, dst_pixels_c[63]);
// Test Odd width version - Last pixel is just 1 horizontal pixel.
ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
EXPECT_EQ(64u, dst_pixels_c[0]);
EXPECT_EQ(25u, dst_pixels_c[1]);
EXPECT_EQ(13u, dst_pixels_c[2]);
EXPECT_EQ(5u, dst_pixels_c[3]);
EXPECT_EQ(0u, dst_pixels_c[4]);
EXPECT_EQ(10u, dst_pixels_c[63]);
// Test one pixel less, should skip the last pixel.
memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
EXPECT_EQ(64u, dst_pixels_c[0]);
EXPECT_EQ(25u, dst_pixels_c[1]);
EXPECT_EQ(13u, dst_pixels_c[2]);
EXPECT_EQ(5u, dst_pixels_c[3]);
EXPECT_EQ(0u, dst_pixels_c[4]);
EXPECT_EQ(0u, dst_pixels_c[63]);
// Test regular half size SSSE3.
ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
EXPECT_EQ(64u, dst_pixels_opt[0]);
EXPECT_EQ(25u, dst_pixels_opt[1]);
EXPECT_EQ(13u, dst_pixels_opt[2]);
EXPECT_EQ(5u, dst_pixels_opt[3]);
EXPECT_EQ(0u, dst_pixels_opt[4]);
EXPECT_EQ(133u, dst_pixels_opt[63]);
// Compare C and SSSE3 match.
ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
for (int i = 0; i < 64; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
}
}
#endif // HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]);
@ -467,71 +373,4 @@ TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) {
free_aligned_buffer_page_end(dst_pixels_alloc);
free_aligned_buffer_page_end(orig_pixels_alloc);
}
// POC: int * int overflow in ScalePlaneVertical (scale_common.cc).
//
// `yi * src_stride` is evaluated as int * int. When the product exceeds
// INT_MAX it wraps negative and InterpolateRow reads from BEFORE the
// source allocation.
//
// Parameters:
// - dst_width == src_width
// -> ScalePlane dispatches to ScalePlaneVertical
// - src_height == 5, dst_height == 1
// -> single iteration with yi == 2
// - src_stride == 0x7FFFFFF8
// -> 2 * 0x7FFFFFF8 == 0xFFFFFFF0 == -16 (int)
//
// The source buffer is sized so that the *correct* 64-bit offset
// (2 * 0x7FFFFFF8 == 4294967280) plus kWidth bytes is in-bounds. With the
// bug, the 32-bit product is -16 and ASAN reports a heap-buffer-overflow
// READ "16 bytes before" the allocation.
TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) {
const int kWidth = 16;
const int kSrcHeight = 5;
const int kDstHeight = 1;
const int kStride = 0x7FFFFFF8; // 2147483640
// src_size is big enough for the only row this call legitimately touches
// (yi == 2) when computed in 64-bit: 2 * stride + width = 4 GiB.
size_t src_size = kStride;
if (src_size > SIZE_MAX / 2) {
GTEST_SKIP() << "could not represent allocation size in size_t";
}
src_size *= 2;
if (src_size > SIZE_MAX - kWidth) {
GTEST_SKIP() << "could not represent allocation size in size_t";
}
src_size += kWidth;
#if defined(__aarch64__)
// Infer malloc can accept a large size for cpu with dot product (a76/a55)
int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd);
#else
int has_large_malloc = 1;
#endif
if (!has_large_malloc) {
GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes";
}
uint8_t* src = new (std::nothrow) uint8_t[src_size];
if (!src) {
GTEST_SKIP() << "could not allocate " << src_size << " bytes";
}
uint8_t* dst = new uint8_t[kWidth];
memset(dst, 0, kWidth);
// Force the scalar path so the crash site is deterministic
// (InterpolateRow_C -> memcpy when yf == 0).
MaskCpuFlags(disable_cpu_flags_);
int r = ScalePlane(src, kStride, kWidth, kSrcHeight, dst, kWidth, kWidth,
kDstHeight, kFilterNone);
// Not reached under ASAN.
EXPECT_EQ(0, r);
delete[] src;
delete[] dst;
}
} // namespace libyuv