BGRAToI420 use BgraConstants for a direct conversion using AVX512BW

Bug: 42280902
Change-Id: I52cb2b0cacea8f2f0b138ec3cc521185dbef8595
2026-06-15 08:26:06 +08:00 · 2026-06-05 10:05:18 -07:00 · 2026-06-05 10:05:18 -07:00 · 51aa1f5ce2
commit 51aa1f5ce2
parent f722313c74
26 changed files with 1565 additions and 1691 deletions
--- a/GEMINI.md
+++ b/GEMINI.md
@ -1,44 +1,62 @@
 # Gemini Project Context: libyuv Row Functions
-This file provides context for the core row-processing architecture of libyuv. Use these guidelines when refactoring, reviewing, or generating code within the `row_*.cc` files.
+This file provides context for the core row-processing architecture of
 libyuv. Use these guidelines when refactoring, reviewing, or generating
 code within the `row_*.cc` files.
 ## Architectural Overview
-Libyuv uses a dispatch system where high-level conversion functions call optimized "Row" functions. These functions are categorized by SIMD architecture and compiler compatibility.
+Libyuv uses a dispatch system where high-level conversion functions call
 optimized "Row" functions. These functions are categorized by SIMD architecture
 and compiler compatibility.
 ## Source File Map
 ### x86 Architectures (32-bit and 64-bit)
-*   **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for GCC and Clang. Supports AVX, and AVX512. AVX512 implementations are strictly for 64-bit targets.
+*   **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for
-*   **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics specifically for Visual C++ (MSVC). Can be tested with Clang using `-DLIBYUV_ENABLE_ROWWIN`.
+    GCC and Clang. Supports AVX and AVX512. AVX512 implementations are strictly
    for 64-bit targets.
 *   **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics
    specifically for Visual C++ (MSVC). Can be tested with Clang using
    `-DLIBYUV_ENABLE_ROWWIN`.
 *   **Note**: Use either `row_gcc` or `row_win`, never both.
 ### ARM Architectures
-*   **row_neon.cc**: 32-bit ARM. Written entirely in inline assembly for GCC/Clang.
+*   **row_neon.cc**: 32-bit ARM. Written entirely in inline assembly for
-*   **row_neon64.cc**: 64-bit ARM (AArch64). Written entirely in inline assembly for GCC/Clang.
+    GCC/Clang.
 *   **row_neon64.cc**: 64-bit ARM (AArch64). Written entirely in inline assembly
    for GCC/Clang.
 *   **row_sve.cc**: ARMv9 Scalable Vector Extensions (SVE).
-*   **row_sme.cc**: ARMv9 Scalable Matrix Extension (SME) and Streaming SVE (SSVE).
+*   **row_sme.cc**: ARMv9 Scalable Matrix Extension (SME) and Streaming SVE
    (SSVE).
 ### Other Architectures
-*   **row_rvv.cc**: RISC-V Vector (RVV). Implemented using intrinsics. Optimized for SiFive X280.
+*   **row_rvv.cc**: RISC-V Vector (RVV). Implemented using intrinsics. Optimized
    for SiFive X280.
 *   **row_lsx.cc / row_lasx.cc**: LoongArch MIPS-like extensions.
 ### Utility and Fallbacks
-*   **row_common.cc**: Portable C/C++ versions. This is the reference implementation.
+*   **row_common.cc**: Portable C/C++ versions. This is the reference
-*   **row_any.cc**: Handles "remainder" pixels for widths not multiples of SIMD register size. Used for x86, NEON, and MIPS. Not required for SVE, SME, or RVV due to hardware-level masking.
+    implementation.
 *   **row_any.cc**: Handles "remainder" pixels for widths not multiples of SIMD
    register size. Used for x86, NEON, and MIPS. Not required for SVE, SME, or
    RVV due to hardware-level masking.
 ## Coding Guidelines
-1.  **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 only**.
+1.  **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86
-2.  **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to enable or disable specific AVX512 versions.
+    only**.
 2.  **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to
    enable or disable specific AVX512 versions.
 ## Changelist (CL) & Commit Guidelines
-When generating descriptions, follow the Chromium/Google standard format. Wrap commit message text at 72 characters
+When generating descriptions, follow the Chromium/Google standard format. Wrap
 commit message text at 72 characters.
 ### Format Example:
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1946
+Version: 1947
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@ -890,42 +890,21 @@ int ABGRToI420(const uint8_t* src_abgr,
 // BGRA little endian (argb in memory) to I422.
 LIBYUV_API
-int BGRAToI422(const uint8_t* src_bgra,
+int BGRAToI422(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_y,
-               int src_stride_bgra,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height);
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // ABGR little endian (rgba in memory) to I422.
 LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
+int ABGRToI422(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height);
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // RGBA little endian (abgr in memory) to I422.
 LIBYUV_API
-int RGBAToI422(const uint8_t* src_rgba,
+int RGBAToI422(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_y,
-               int src_stride_rgba,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height);
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@ -247,16 +247,9 @@ int ARGBToI422(const uint8_t* src_argb,
 // Convert ABGR To I422.
 LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
+int ABGRToI422(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height);
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // RGB to I444 with matrix. See ArgbConstants at the top of this file for usage.
 LIBYUV_API
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -352,9 +352,11 @@ extern "C" {
 #define HAS_RGB565TOARGBROW_AVX2
 #define HAS_ARGB1555TOARGBROW_AVX2
 #define HAS_ARGB4444TOARGBROW_AVX2
 #define HAS_ARGBSHUFFLEROW_AVX2
 #if defined(__x86_64__) || defined(_M_X64)
 #define HAS_RAWTOARGBROW_AVX512BW
 #define HAS_RGB24TOARGBROW_AVX512BW
 #define HAS_ARGBSHUFFLEROW_AVX512BW
 #endif
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOYMATRIXROW_AVX2
@ -383,7 +385,6 @@ extern "C" {
 #endif
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_CONVERT16TO8ROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
 #endif
 // The following are available for AVX512 clang x64 platforms:
@ -401,6 +402,11 @@ extern "C" {
 #define HAS_ARGBTOUVJROW_AVX512BW
 #define HAS_ARGBTOUVMATRIXROW_AVX512BW
 #define HAS_J400TOARGBROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
 #define HAS_MIRRORROW_AVX512BW
 #define HAS_MIRRORSPLITUVROW_AVX512BW
 #define HAS_SPLITUVROW_AVX512BW
 #define HAS_RGBTOUVMATRIXROW_AVX512BW
 #endif
 // The following are available on Neon platforms:
@ -2180,29 +2186,27 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
                         uint8_t* dst_v,
                         int width,
                         const struct ArgbConstants* c);
-void RGBToYMatrixRow_C(const uint8_t* src_rgb,
+void RGBToYMatrixRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width,
                       uint8_t* dst_y,
                       int width,
                       const struct ArgbConstants* c);
-void RGBToUVMatrixRow_C(const uint8_t* src_rgb,
+void RGBToUVMatrixRow_C(const uint8_t* src_rgb, int src_stride_rgb,
-                        int src_stride_rgb,
+                        uint8_t* dst_u, uint8_t* dst_v, int width,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width,
                        const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565,
+void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width,
                          uint8_t* dst_y,
                          int width,
                          const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c);
+void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555, uint8_t* dst_y,
-void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+                            int width, const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c);
+void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
-void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+                             int src_stride_argb1555, uint8_t* dst_u,
-void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
+                             uint8_t* dst_v, int width,
-                           int src_stride_rgb565,
+                             const struct ArgbConstants* c);
-                           uint8_t* dst_u,
+void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444, uint8_t* dst_y,
-                           uint8_t* dst_v,
+                            int width, const struct ArgbConstants* c);
-                           int width,
+void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
                             int src_stride_argb4444, uint8_t* dst_u,
                             uint8_t* dst_v, int width,
                             const struct ArgbConstants* c);
 void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565, int src_stride_rgb565,
                           uint8_t* dst_u, uint8_t* dst_v, int width,
                           const struct ArgbConstants* c);
 void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
                             int src_stride_argb,
@ -2210,8 +2214,18 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
                             uint8_t* dst_v,
                             int width,
                             const struct ArgbConstants* c);
-void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, int src_stride_rgb,
-void RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+                           uint8_t* dst_u, uint8_t* dst_v, int width,
                           const struct ArgbConstants* c);
 void RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb, int src_stride_rgb,
                               uint8_t* dst_u, uint8_t* dst_v, int width,
                               const struct ArgbConstants* c);
 void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb, int src_stride_rgb,
                               uint8_t* dst_u, uint8_t* dst_v, int width,
                               const struct ArgbConstants* c);
 void RGBToUVMatrixRow_Any_AVX512BW(const uint8_t* src_rgb, int src_stride_rgb,
                                   uint8_t* dst_u, uint8_t* dst_v, int width,
                                   const struct ArgbConstants* c);
 void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
@ -2233,86 +2247,80 @@ void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_y,
                            int width,
                            const struct ArgbConstants* c);
-void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb,
+void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_y, int width,
                          uint8_t* dst_y,
                          int width,
                          const struct ArgbConstants* c);
-void RGBToYMatrixRow_Any_AVX2(const uint8_t* src_rgb,
+void RGBToYMatrixRow_Any_AVX2(const uint8_t* src_rgb, uint8_t* dst_y, int width,
                              uint8_t* dst_y,
                              int width,
                              const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565,
+void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_y,
-                             uint8_t* dst_y,
+                             int width, const struct ArgbConstants* c);
-                             int width,
+void RGB565ToYMatrixRow_Any_AVX2(const uint8_t* src_rgb565, uint8_t* dst_y,
-                             const struct ArgbConstants* c);
+                                 int width, const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_Any_AVX2(const uint8_t* src_rgb565,
+void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_y,
-                                 uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c);
-                                 int width,
+void ARGB1555ToYMatrixRow_Any_AVX2(const uint8_t* src_argb1555, uint8_t* dst_y,
-                                 const struct ArgbConstants* c);
+                                   int width, const struct ArgbConstants* c);
 void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555,
                               uint8_t* dst_y,
                               int width,
                               const struct ArgbConstants* c);
 void ARGB1555ToYMatrixRow_Any_AVX2(const uint8_t* src_argb1555,
                                   uint8_t* dst_y,
                                   int width,
                                   const struct ArgbConstants* c);
 void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
+                                int src_stride_argb1555, uint8_t* dst_u,
-                                uint8_t* dst_u,
+                                uint8_t* dst_v, int width,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c);
 void ARGB1555ToUVMatrixRow_Any_AVX2(const uint8_t* src_argb1555,
-                                    int src_stride_argb1555,
+                                    int src_stride_argb1555, uint8_t* dst_u,
-                                    uint8_t* dst_u,
+                                    uint8_t* dst_v, int width,
                                    uint8_t* dst_v,
                                    int width,
                                    const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444,
+void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_y,
-                               uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c);
-                               int width,
+void ARGB4444ToYMatrixRow_Any_AVX2(const uint8_t* src_argb4444, uint8_t* dst_y,
-                               const struct ArgbConstants* c);
+                                   int width, const struct ArgbConstants* c);
 void ARGB4444ToYMatrixRow_Any_AVX2(const uint8_t* src_argb4444,
                                   uint8_t* dst_y,
                                   int width,
                                   const struct ArgbConstants* c);
 void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
+                                int src_stride_argb4444, uint8_t* dst_u,
-                                uint8_t* dst_u,
+                                uint8_t* dst_v, int width,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c);
 void ARGB4444ToUVMatrixRow_Any_AVX2(const uint8_t* src_argb4444,
-                                    int src_stride_argb4444,
+                                    int src_stride_argb4444, uint8_t* dst_u,
-                                    uint8_t* dst_u,
+                                    uint8_t* dst_v, int width,
                                    uint8_t* dst_v,
                                    int width,
                                    const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565,
+void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, int src_stride_rgb565,
-                              int src_stride_rgb565,
+                              uint8_t* dst_u, uint8_t* dst_v, int width,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width,
                              const struct ArgbConstants* c);
 void RGB565ToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb565,
-                                  int src_stride_rgb565,
+                                  int src_stride_rgb565, uint8_t* dst_u,
-                                  uint8_t* dst_u,
+                                  uint8_t* dst_v, int width,
-                                  uint8_t* dst_v,
+                                  const struct ArgbConstants* c);
-                                  int width,
+void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y,
                             int width, const struct ArgbConstants* c);
 void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y,
                               int width, const struct ArgbConstants* c);
 void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
                                int src_stride_argb1555, uint8_t* dst_u,
                                uint8_t* dst_v, int width,
                                const struct ArgbConstants* c);
 void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y,
                               int width, const struct ArgbConstants* c);
 void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444,
                                int src_stride_argb4444, uint8_t* dst_u,
                                uint8_t* dst_v, int width,
                                const struct ArgbConstants* c);
 void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565,
                              uint8_t* dst_u, uint8_t* dst_v, int width,
                              const struct ArgbConstants* c);
 void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565, uint8_t* dst_y,
                                 int width, const struct ArgbConstants* c);
 void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555, uint8_t* dst_y,
                                   int width, const struct ArgbConstants* c);
 void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555,
                                    int src_stride_argb1555, uint8_t* dst_u,
                                    uint8_t* dst_v, int width,
                                    const struct ArgbConstants* c);
 void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444, uint8_t* dst_y,
                                   int width, const struct ArgbConstants* c);
 void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444,
                                    int src_stride_argb4444, uint8_t* dst_u,
                                    uint8_t* dst_v, int width,
                                    const struct ArgbConstants* c);
 void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565,
                                  int src_stride_rgb565, uint8_t* dst_u,
                                  uint8_t* dst_v, int width,
                                  const struct ArgbConstants* c);
 void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c);
 void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c);
 void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
 void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c);
 void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
 void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
 void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c);
 void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c);
 void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
 void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c);
 void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
 void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_y,
@ -2340,9 +2348,14 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                          int width,
                          const struct ArgbConstants* c);
-void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, int src_stride_rgb,
-void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct ArgbConstants* c);
+                           uint8_t* dst_u, uint8_t* dst_v, int width,
-void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+                           const struct ArgbConstants* c);
 void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width,
                              const struct ArgbConstants* c);
 void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb, int src_stride_rgb,
                               uint8_t* dst_u, uint8_t* dst_v, int width,
                               const struct ArgbConstants* c);
 void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
                                   uint8_t* dst_y,
@ -3040,12 +3053,15 @@ void ARGBToUVJ444Row_C(const uint8_t* src_argb,
                       uint8_t* dst_v,
                       int width);
 void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr,
                            int width);
 void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
@ -3063,9 +3079,9 @@ void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorSplitUVRow_AVX2(const uint8_t* src,
+void MirrorSplitUVRow_AVX512BW(const uint8_t* src, uint8_t* dst_u,
-                           uint8_t* dst_u,
+                               uint8_t* dst_v, int width);
-                           uint8_t* dst_v,
+void MirrorSplitUVRow_AVX2(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
                           int width);
 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
@ -3102,15 +3118,13 @@ void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int width);
-void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
+void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_rgb24,
                         uint8_t* dst_rgb24,
                         int width);
 void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width);
 void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width);
-void RGB24MirrorRow_Any_AVX2(const uint8_t* src_ptr,
+void RGB24MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr,
                             uint8_t* dst_ptr,
                             int width);
 void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
@ -3124,6 +3138,8 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
 void SplitUVRow_AVX512BW(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                         int width);
 void SplitUVRow_AVX2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
@ -3140,6 +3156,8 @@ void SplitUVRow_RVV(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width);
 void SplitUVRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_u,
                             uint8_t* dst_v, int width);
 void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@ -4114,10 +4132,8 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width);
-void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
+void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_argb,
-                             uint8_t* dst_argb,
+                             const uint8_t* shuffler, int width);
                             const uint8_t* shuffler,
                             int width);
 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
@ -4138,10 +4154,8 @@ void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             const uint8_t* param,
                             int width);
-void ARGBShuffleRow_Any_AVX512BW(const uint8_t* src_ptr,
+void ARGBShuffleRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr,
-                                 uint8_t* dst_ptr,
+                                 const uint8_t* param, int width);
                                 const uint8_t* param,
                                 int width);
 void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             const uint8_t* param,
@ -4160,7 +4174,8 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                          int width);
 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb,
                         int width);
 void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
@ -4250,20 +4265,14 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
 void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int width);
-void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-                           uint8_t* dst_ptr,
+void RGB24ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr,
                           int width);
 void RGB24ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
-void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
+void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr,
                               uint8_t* dst_ptr,
                               int width);
-void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24,
+void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb,
                             uint8_t* dst_argb,
                             int width);
-void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
+void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                 uint8_t* dst_ptr,
                                 int width);
 void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
@ -4272,7 +4281,6 @@ void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
 void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
@ -4512,8 +4520,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width);
 void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
-void J400ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
+void J400ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                uint8_t* dst_ptr,
                                int width);
 void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
@ -4729,12 +4736,9 @@ void I444ToARGBRow_AVX2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
-void I444ToARGBRow_AVX512BW(const uint8_t* y_buf,
+void I444ToARGBRow_AVX512BW(const uint8_t* y_buf, const uint8_t* u_buf,
-                            const uint8_t* u_buf,
+                            const uint8_t* v_buf, uint8_t* dst_argb,
-                            const uint8_t* v_buf,
+                            const struct YuvConstants* yuvconstants, int width);
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width);
 void I444ToRGB24Row_SSSE3(const uint8_t* y_buf,
                          const uint8_t* u_buf,
                          const uint8_t* v_buf,
@ -5057,10 +5061,8 @@ void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
                            uint8_t* dst_ptr,
                            const struct YuvConstants* yuvconstants,
                            int width);
-void I444ToARGBRow_Any_AVX512BW(const uint8_t* y_buf,
+void I444ToARGBRow_Any_AVX512BW(const uint8_t* y_buf, const uint8_t* u_buf,
-                                const uint8_t* u_buf,
+                                const uint8_t* v_buf, uint8_t* dst_ptr,
                                const uint8_t* v_buf,
                                uint8_t* dst_ptr,
                                const struct YuvConstants* yuvconstants,
                                int width);
 void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
@ -6712,15 +6714,11 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction);
-void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
+void InterpolateRow_16_AVX2(uint16_t* dst_ptr, const uint16_t* src_ptr,
-                            const uint16_t* src_ptr,
+                            ptrdiff_t src_stride, int width,
                            ptrdiff_t src_stride,
                            int width,
                            int source_y_fraction);
-void InterpolateRow_16_Any_AVX2(uint16_t* dst_ptr,
+void InterpolateRow_16_Any_AVX2(uint16_t* dst_ptr, const uint16_t* src_ptr,
-                                const uint16_t* src_ptr,
+                                ptrdiff_t src_stride, int width,
                                ptrdiff_t src_stride,
                                int width,
                                int source_y_fraction);
 void InterpolateRow_16_NEON(uint16_t* dst_ptr,
                            const uint16_t* src_ptr,
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1946
+#define LIBYUV_VERSION 1947
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert.cc
+++ b/source/convert.cc
@ -13,12 +13,11 @@
 #include <limits.h>
 #include "libyuv/basic_types.h"
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/rotate.h"
 #include "libyuv/row.h"
 #include "libyuv/scale.h"      // For ScalePlane()
 #include "libyuv/scale_row.h"  // For FixedDiv
 #include "libyuv/scale_uv.h"   // For UVScale()
@ -948,8 +947,7 @@ int I422ToNV21(const uint8_t* src_y,
  // Allocate u and v buffers
  const uint64_t plane_size = (uint64_t)halfwidth * halfheight;
-  if (plane_size > SIZE_MAX / 2)
+  if (plane_size > SIZE_MAX / 2) return 1;
    return 1;
  align_buffer_64(plane_u, (size_t)plane_size * 2);
  if (!plane_u)
    return 1;
@ -2034,8 +2032,8 @@ int ARGBToI420(const uint8_t* src_argb,
               int width,
               int height) {
  return ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kArgbI601Constants, width, height);
 }
 LIBYUV_API
@ -2121,34 +2119,34 @@ ARGBToUVMatrixRow_C;
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
+  if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
+  if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
@ -2207,18 +2205,10 @@ ARGBToUVMatrixRow_C;
 // Convert ARGB to I420 with Alpha
 // The following version calls ARGBExtractAlpha on the full image.
 LIBYUV_API
-int ARGBToI420Alpha(const uint8_t* src_argb,
+int ARGBToI420Alpha(const uint8_t* src_argb, int src_stride_argb,
-                    int src_stride_argb,
+                    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
-                    uint8_t* dst_y,
+                    int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
-                    int dst_stride_y,
+                    uint8_t* dst_a, int dst_stride_a, int width, int height) {
                    uint8_t* dst_u,
                    int dst_stride_u,
                    uint8_t* dst_v,
                    int dst_stride_v,
                    uint8_t* dst_a,
                    int dst_stride_a,
                    int width,
                    int height) {
  int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
                     dst_stride_u, dst_v, dst_stride_v, width, height);
  if (r == 0) {
@ -2230,18 +2220,10 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
 #else  // USE_EXTRACTALPHA
 // Convert ARGB to I420 with Alpha
 LIBYUV_API
-int ARGBToI420Alpha(const uint8_t* src_argb,
+int ARGBToI420Alpha(const uint8_t* src_argb, int src_stride_argb,
-                    int src_stride_argb,
+                    uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
-                    uint8_t* dst_y,
+                    int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
-                    int dst_stride_y,
+                    uint8_t* dst_a, int dst_stride_a, int width, int height) {
                    uint8_t* dst_u,
                    int dst_stride_u,
                    uint8_t* dst_v,
                    int dst_stride_v,
                    uint8_t* dst_a,
                    int dst_stride_a,
                    int width,
                    int height) {
  int y;
  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
                      uint8_t* dst_u, uint8_t* dst_v, int width) =
@ -2428,104 +2410,62 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
 // Convert BGRA to I420.
 LIBYUV_API
-int BGRAToI420(const uint8_t* src_bgra,
+int BGRAToI420(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_y,
-               int src_stride_bgra,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI420Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kBgraI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kBgraI601Constants, width, height);
 }
 // Convert BGRA to I422.
 LIBYUV_API
-int BGRAToI422(const uint8_t* src_bgra,
+int BGRAToI422(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_y,
-               int src_stride_bgra,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI422Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kBgraI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kBgraI601Constants, width, height);
 }
 // Convert ABGR to I422.
 LIBYUV_API
-int ABGRToI422(const uint8_t* src_abgr,
+int ABGRToI422(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kAbgrI601Constants, width, height);
 }
 // Convert RGBA to I422.
 LIBYUV_API
-int RGBAToI422(const uint8_t* src_rgba,
+int RGBAToI422(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_y,
-               int src_stride_rgba,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI422Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kRgbaI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kRgbaI601Constants, width, height);
 }
 // Convert ABGR to I420.
 LIBYUV_API
-int ABGRToI420(const uint8_t* src_abgr,
+int ABGRToI420(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kAbgrI601Constants, width, height);
 }
 // Convert RGBA to I420.
 LIBYUV_API
-int RGBAToI420(const uint8_t* src_rgba,
+int RGBAToI420(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_y,
-               int src_stride_rgba,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI420Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kRgbaI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kRgbaI601Constants, width, height);
 }
 // Enabled if 1 pass is available
@ -2536,16 +2476,9 @@ int RGBAToI420(const uint8_t* src_rgba,
 // Convert RGB24 to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8_t* src_rgb24,
+int RGB24ToI420(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_y,
-                int src_stride_rgb24,
+                int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-                uint8_t* dst_y,
+                uint8_t* dst_v, int dst_stride_v, int width, int height) {
                int dst_stride_y,
                uint8_t* dst_u,
                int dst_stride_u,
                uint8_t* dst_v,
                int dst_stride_v,
                int width,
                int height) {
  int y;
  void (*RGBToUVMatrixRow)(const uint8_t* src_rgb, int src_stride_rgb,
                           uint8_t* dst_u, uint8_t* dst_v, int width,
@ -2569,6 +2502,14 @@ int RGB24ToI420(const uint8_t* src_rgb24,
    }
  }
 #endif
 #if defined(HAS_RGBTOUVMATRIXROW_AVX512BW)
  if (TestCpuFlag(kCpuHasAVX512BW)) {
    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW;
    if (IS_ALIGNED(width, 64)) {
      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW;
    }
  }
 #endif
 #if defined(HAS_RGBTOUVMATRIXROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON;
@ -2603,9 +2544,11 @@ int RGB24ToI420(const uint8_t* src_rgb24,
  }
  for (y = 0; y < height - 1; y += 2) {
-    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, &kArgbI601Constants);
+    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width,
                     &kArgbI601Constants);
    RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width,
                    &kArgbI601Constants);
    src_rgb24 += src_stride_rgb24 * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -2626,16 +2569,9 @@ int RGB24ToI420(const uint8_t* src_rgb24,
 // Convert RGB24 to J420.
 LIBYUV_API
-int RGB24ToJ420(const uint8_t* src_rgb24,
+int RGB24ToJ420(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_y,
-                int src_stride_rgb24,
+                int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-                uint8_t* dst_y,
+                uint8_t* dst_v, int dst_stride_v, int width, int height) {
                int dst_stride_y,
                uint8_t* dst_u,
                int dst_stride_u,
                uint8_t* dst_v,
                int dst_stride_v,
                int width,
                int height) {
  int y;
 #if defined(HAS_RGB24TOYJROW)
  void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
@ -2806,8 +2742,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
    // Allocate 2 rows of ARGB.
    const int row_size = (width * 4 + 31) & ~31;
    align_buffer_64(row, row_size * 2);
-    if (!row)
+    if (!row) return 1;
      return 1;
 #endif
    for (y = 0; y < height - 1; y += 2) {
@ -2853,16 +2788,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
 // Convert RAW to I420.
 LIBYUV_API
-int RAWToI420(const uint8_t* src_rgb24,
+int RAWToI420(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_y,
-                int src_stride_rgb24,
+              int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-                uint8_t* dst_y,
+              uint8_t* dst_v, int dst_stride_v, int width, int height) {
                int dst_stride_y,
                uint8_t* dst_u,
                int dst_stride_u,
                uint8_t* dst_v,
                int dst_stride_v,
                int width,
                int height) {
  int y;
  void (*RGBToUVMatrixRow)(const uint8_t* src_rgb, int src_stride_rgb,
                           uint8_t* dst_u, uint8_t* dst_v, int width,
@ -2886,6 +2814,14 @@ int RAWToI420(const uint8_t* src_rgb24,
    }
  }
 #endif
 #if defined(HAS_RGBTOUVMATRIXROW_AVX512BW)
  if (TestCpuFlag(kCpuHasAVX512BW)) {
    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW;
    if (IS_ALIGNED(width, 64)) {
      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW;
    }
  }
 #endif
 #if defined(HAS_RGBTOUVMATRIXROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON;
@ -2920,9 +2856,11 @@ int RAWToI420(const uint8_t* src_rgb24,
  }
  for (y = 0; y < height - 1; y += 2) {
-    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, &kArgbI601Constants);
+    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width,
                     &kArgbI601Constants);
    RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width,
                    &kArgbI601Constants);
    src_rgb24 += src_stride_rgb24 * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -2943,16 +2881,9 @@ int RAWToI420(const uint8_t* src_rgb24,
 // Convert RAW to J420.
 LIBYUV_API
-int RAWToJ420(const uint8_t* src_raw,
+int RAWToJ420(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_y,
-              int src_stride_raw,
+              int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-              uint8_t* dst_y,
+              uint8_t* dst_v, int dst_stride_v, int width, int height) {
              int dst_stride_y,
              uint8_t* dst_u,
              int dst_stride_u,
              uint8_t* dst_v,
              int dst_stride_v,
              int width,
              int height) {
  int y;
 #if defined(HAS_RAWTOYJROW)
  void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
@ -3161,16 +3092,9 @@ int RAWToJ420(const uint8_t* src_raw,
 // RAW big endian (rgb in memory) to I444
 // 2 step conversion of RAWToARGB then ARGBToY and ARGBToUV444
 LIBYUV_API
-int RAWToI444(const uint8_t* src_raw,
+int RAWToI444(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_y,
-              int src_stride_raw,
+              int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-              uint8_t* dst_y,
+              uint8_t* dst_v, int dst_stride_v, int width, int height) {
              int dst_stride_y,
              uint8_t* dst_u,
              int dst_stride_u,
              uint8_t* dst_v,
              int dst_stride_v,
              int width,
              int height) {
  int y;
  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
      RAWToARGBRow_C;
@ -3370,8 +3294,7 @@ int RAWToI444(const uint8_t* src_raw,
    // Allocate a row of ARGB.
    const int row_size = width * 4;
    align_buffer_64(row, row_size);
-    if (!row)
+    if (!row) return 1;
      return 1;
    for (y = 0; y < height; ++y) {
      RAWToARGBRow(src_raw, row, width);
@ -3390,16 +3313,9 @@ int RAWToI444(const uint8_t* src_raw,
 // RAW big endian (rgb in memory) to J444
 // 2 step conversion of RAWToARGB then ARGBToYJ and ARGBToUVJ444
 LIBYUV_API
-int RAWToJ444(const uint8_t* src_raw,
+int RAWToJ444(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_y,
-              int src_stride_raw,
+              int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-              uint8_t* dst_y,
+              uint8_t* dst_v, int dst_stride_v, int width, int height) {
              int dst_stride_y,
              uint8_t* dst_u,
              int dst_stride_u,
              uint8_t* dst_v,
              int dst_stride_v,
              int width,
              int height) {
  int y;
  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
      RAWToARGBRow_C;
@ -3590,8 +3506,7 @@ int RAWToJ444(const uint8_t* src_raw,
    // Allocate a row of ARGB.
    const int row_size = width * 4;
    align_buffer_64(row, row_size);
-    if (!row)
+    if (!row) return 1;
      return 1;
    for (y = 0; y < height; ++y) {
      RAWToARGBRow(src_raw, row, width);
@ -3609,22 +3524,18 @@ int RAWToJ444(const uint8_t* src_raw,
 // Convert RGB565 to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8_t* src_rgb565,
+int RGB565ToI420(const uint8_t* src_rgb565, int src_stride_rgb565,
-                 int src_stride_rgb565,
+                 uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
-                 uint8_t* dst_y,
+                 int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width,
                 int dst_stride_y,
                 uint8_t* dst_u,
                 int dst_stride_u,
                 uint8_t* dst_v,
                 int dst_stride_v,
                 int width,
                 int height) {
  int y;
  void (*RGB565ToUVMatrixRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
                              uint8_t* dst_u, uint8_t* dst_v, int width,
-                              const struct ArgbConstants* c) = RGB565ToUVMatrixRow_C;
+                              const struct ArgbConstants* c) =
-  void (*RGB565ToYMatrixRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width,
+      RGB565ToUVMatrixRow_C;
-                             const struct ArgbConstants* c) = RGB565ToYMatrixRow_C;
+  void (*RGB565ToYMatrixRow)(const uint8_t* src_rgb565, uint8_t* dst_y,
                             int width, const struct ArgbConstants* c) =
      RGB565ToYMatrixRow_C;
 #if defined(HAS_RGB565TOYMATRIXROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
@ -3671,9 +3582,11 @@ int RGB565ToI420(const uint8_t* src_rgb565,
  }
  for (y = 0; y < height - 1; y += 2) {
-    RGB565ToUVMatrixRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width, &kArgbI601Constants);
+    RGB565ToUVMatrixRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width,
                        &kArgbI601Constants);
    RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants);
-    RGB565ToYMatrixRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    RGB565ToYMatrixRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y,
                       width, &kArgbI601Constants);
    src_rgb565 += src_stride_rgb565 * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -3681,30 +3594,25 @@ int RGB565ToI420(const uint8_t* src_rgb565,
  }
  if (height & 1) {
    RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants);
-    RGB565ToUVMatrixRow(src_rgb565, 0, dst_u, dst_v, width, &kArgbI601Constants);
+    RGB565ToUVMatrixRow(src_rgb565, 0, dst_u, dst_v, width,
                        &kArgbI601Constants);
  }
  return 0;
 }
 // Convert ARGB1555 to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8_t* src_argb1555,
+int ARGB1555ToI420(const uint8_t* src_argb1555, int src_stride_argb1555,
-                 int src_stride_argb1555,
+                   uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
-                 uint8_t* dst_y,
+                   int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
-                 int dst_stride_y,
+                   int width, int height) {
                 uint8_t* dst_u,
                 int dst_stride_u,
                 uint8_t* dst_v,
                 int dst_stride_v,
                 int width,
                 int height) {
  int y;
  void (*ARGB1555ToUVMatrixRow)(
      const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u,
-      uint8_t* dst_v, int width,
+      uint8_t* dst_v, int width, const struct ArgbConstants* c) =
-      const struct ArgbConstants* c) = ARGB1555ToUVMatrixRow_C;
+      ARGB1555ToUVMatrixRow_C;
-  void (*ARGB1555ToYMatrixRow)(
+  void (*ARGB1555ToYMatrixRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
-      const uint8_t* src_argb1555, uint8_t* dst_y, int width,
+                               int width, const struct ArgbConstants* c) =
-      const struct ArgbConstants* c) = ARGB1555ToYMatrixRow_C;
+      ARGB1555ToYMatrixRow_C;
 #if defined(HAS_ARGB1555TOYMATRIXROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
@ -3751,9 +3659,11 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
  }
  for (y = 0; y < height - 1; y += 2) {
-    ARGB1555ToUVMatrixRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB1555ToUVMatrixRow(src_argb1555, src_stride_argb1555, dst_u, dst_v,
                          width, &kArgbI601Constants);
    ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants);
-    ARGB1555ToYMatrixRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    ARGB1555ToYMatrixRow(src_argb1555 + src_stride_argb1555,
                         dst_y + dst_stride_y, width, &kArgbI601Constants);
    src_argb1555 += src_stride_argb1555 * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -3761,30 +3671,25 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
  }
  if (height & 1) {
    ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants);
-    ARGB1555ToUVMatrixRow(src_argb1555, 0, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB1555ToUVMatrixRow(src_argb1555, 0, dst_u, dst_v, width,
                          &kArgbI601Constants);
  }
  return 0;
 }
 // Convert ARGB4444 to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8_t* src_argb4444,
+int ARGB4444ToI420(const uint8_t* src_argb4444, int src_stride_argb4444,
-                 int src_stride_argb4444,
+                   uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u,
-                 uint8_t* dst_y,
+                   int dst_stride_u, uint8_t* dst_v, int dst_stride_v,
-                 int dst_stride_y,
+                   int width, int height) {
                 uint8_t* dst_u,
                 int dst_stride_u,
                 uint8_t* dst_v,
                 int dst_stride_v,
                 int width,
                 int height) {
  int y;
  void (*ARGB4444ToUVMatrixRow)(
      const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u,
-      uint8_t* dst_v, int width,
+      uint8_t* dst_v, int width, const struct ArgbConstants* c) =
-      const struct ArgbConstants* c) = ARGB4444ToUVMatrixRow_C;
+      ARGB4444ToUVMatrixRow_C;
-  void (*ARGB4444ToYMatrixRow)(
+  void (*ARGB4444ToYMatrixRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
-      const uint8_t* src_argb4444, uint8_t* dst_y, int width,
+                               int width, const struct ArgbConstants* c) =
-      const struct ArgbConstants* c) = ARGB4444ToYMatrixRow_C;
+      ARGB4444ToYMatrixRow_C;
 #if defined(HAS_ARGB4444TOYMATRIXROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
@ -3831,9 +3736,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
  }
  for (y = 0; y < height - 1; y += 2) {
-    ARGB4444ToUVMatrixRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB4444ToUVMatrixRow(src_argb4444, src_stride_argb4444, dst_u, dst_v,
                          width, &kArgbI601Constants);
    ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants);
-    ARGB4444ToYMatrixRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    ARGB4444ToYMatrixRow(src_argb4444 + src_stride_argb4444,
                         dst_y + dst_stride_y, width, &kArgbI601Constants);
    src_argb4444 += src_stride_argb4444 * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -3841,18 +3748,15 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
  }
  if (height & 1) {
    ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants);
-    ARGB4444ToUVMatrixRow(src_argb4444, 0, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB4444ToUVMatrixRow(src_argb4444, 0, dst_u, dst_v, width,
                          &kArgbI601Constants);
  }
  return 0;
 }
 // Convert RGB24 to J400.
 LIBYUV_API
-int RGB24ToJ400(const uint8_t* src_rgb24,
+int RGB24ToJ400(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_yj,
-                int src_stride_rgb24,
+                int dst_stride_yj, int width, int height) {
                uint8_t* dst_yj,
                int dst_stride_yj,
                int width,
                int height) {
  int y;
  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
      RGB24ToARGBRow_C;
@ -3993,7 +3897,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
    RGB24ToARGBRow = RGB24ToARGBRow_RVV;
  }
 #endif
-{
+  {
    // Allocate 1 row of ARGB.
    const int row_size = (width * 4 + 31) & ~31;
    align_buffer_64(row, row_size);
@ -4165,8 +4069,7 @@ int RAWToJ400(const uint8_t* src_raw,
    // Allocate 1 row of ARGB.
    const int row_size = (width * 4 + 31) & ~31;
    align_buffer_64(row, row_size);
-    if (!row)
+    if (!row) return 1;
      return 1;
    for (y = 0; y < height; ++y) {
      RAWToARGBRow(src_raw, row, width);
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -5098,8 +5098,7 @@ int Android420ToARGBMatrix(const uint8_t* src_y,
  // General case fallback creates NV12
  const uint64_t uv_size = (uint64_t)halfwidth * 2 * halfheight;
-  if (uv_size > SIZE_MAX)
+  if (uv_size > SIZE_MAX) return 1;
    return 1;
  align_buffer_64(plane_uv, (size_t)uv_size);
  if (!plane_uv)
    return 1;
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@ -35,8 +35,8 @@ int ARGBToI444(const uint8_t* src_argb,
               int width,
               int height) {
  return ARGBToI444Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kArgbI601Constants, width, height);
 }
 LIBYUV_API
@ -188,8 +188,8 @@ int ARGBToI422(const uint8_t* src_argb,
               int width,
               int height) {
  return ARGBToI422Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kArgbI601Constants, width, height);
 }
 LIBYUV_API
@ -275,34 +275,34 @@ ARGBToUVMatrixRow_C;
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
+  if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
+  if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
@ -445,34 +445,34 @@ ARGBToUVMatrixRow_C;
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
+  if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
+  if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
@ -579,14 +579,10 @@ ARGBToUVMatrixRow_C;
  return 0;
 }
-int ARGBToNV21Matrix(const uint8_t* src_argb,
+int ARGBToNV21Matrix(const uint8_t* src_argb, int src_stride_argb,
-                     int src_stride_argb,
+                     uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu,
                     uint8_t* dst_y,
                     int dst_stride_y,
                     uint8_t* dst_vu,
                     int dst_stride_uv,
-                     const struct ArgbConstants* argbconstants,
+                     const struct ArgbConstants* argbconstants, int width,
                     int width,
                     int height) {
  int y;
  int halfwidth = (width + 1) >> 1;
@ -595,7 +591,7 @@ int ARGBToNV21Matrix(const uint8_t* src_argb,
  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                            uint8_t* dst_u, uint8_t* dst_v, int width,
                            const struct ArgbConstants* c) =
-ARGBToUVMatrixRow_C;
+      ARGBToUVMatrixRow_C;
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
@ -660,34 +656,34 @@ ARGBToUVMatrixRow_C;
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
+  if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
+  if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
@ -771,8 +767,7 @@ ARGBToUVMatrixRow_C;
  // Allocate a rows of uv.
  align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
  uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
-  if (!row_u)
+  if (!row_u) return 1;
    return 1;
  for (y = 0; y < height - 1; y += 2) {
    ARGBToUVMatrixRow(src_argb, src_stride_argb, row_u, row_v, width,
@ -780,7 +775,7 @@ ARGBToUVMatrixRow_C;
    MergeUVRow(row_u, row_v, dst_vu, halfwidth);
    ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
    ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width,
-                      argbconstants);
+                     argbconstants);
    src_argb += src_stride_argb * 2;
    dst_y += dst_stride_y * 2;
    dst_vu += dst_stride_uv;
@ -794,12 +789,9 @@ ARGBToUVMatrixRow_C;
  return 0;
 }
 LIBYUV_API
-int ARGBToI400Matrix(const uint8_t* src_argb,
+int ARGBToI400Matrix(const uint8_t* src_argb, int src_stride_argb,
-                     int src_stride_argb,
+                     uint8_t* dst_y, int dst_stride_y,
-                     uint8_t* dst_y,
+                     const struct ArgbConstants* constants, int width,
                     int dst_stride_y,
                     const struct ArgbConstants* constants,
                     int width,
                     int height) {
  int y;
  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
@ -854,17 +846,15 @@ int ARGBToI400Matrix(const uint8_t* src_argb,
  return 0;
 }
 LIBYUV_API
-int ARGBToYUY2Matrix(const uint8_t* src_argb,
+int ARGBToYUY2Matrix(const uint8_t* src_argb, int src_stride_argb,
-                     int src_stride_argb,
+                     uint8_t* dst_yuy2, int dst_stride_yuy2,
-                     uint8_t* dst_yuy2,
+                     const struct ArgbConstants* constants, int width,
                     int dst_stride_yuy2,
                     const struct ArgbConstants* constants,
                     int width,
                     int height) {
  int y;
  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                            uint8_t* dst_u, uint8_t* dst_v, int width,
-                            const struct ArgbConstants* c) = ARGBToUVMatrixRow_C;
+                            const struct ArgbConstants* c) =
      ARGBToUVMatrixRow_C;
  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
@ -966,17 +956,15 @@ int ARGBToYUY2Matrix(const uint8_t* src_argb,
 }
 LIBYUV_API
-int ARGBToUYVYMatrix(const uint8_t* src_argb,
+int ARGBToUYVYMatrix(const uint8_t* src_argb, int src_stride_argb,
-                     int src_stride_argb,
+                     uint8_t* dst_uyvy, int dst_stride_uyvy,
-                     uint8_t* dst_uyvy,
+                     const struct ArgbConstants* constants, int width,
                     int dst_stride_uyvy,
                     const struct ArgbConstants* constants,
                     int width,
                     int height) {
  int y;
  void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                            uint8_t* dst_u, uint8_t* dst_v, int width,
-                            const struct ArgbConstants* c) = ARGBToUVMatrixRow_C;
+                            const struct ArgbConstants* c) =
      ARGBToUVMatrixRow_C;
  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
@ -1077,69 +1065,47 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
  return 0;
 }
 // Same as NV12 but U and V swapped.
 LIBYUV_API
-int ARGBToNV21(const uint8_t* src_argb,
+int ARGBToNV21(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y,
-               int src_stride_argb,
+               int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_vu,
               int dst_stride_vu,
               int width,
               int height) {
-  return ARGBToNV21Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_vu,
+  return ARGBToNV21Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                          dst_stride_vu, &kArgbI601Constants, width, height);
+                          dst_vu, dst_stride_vu, &kArgbI601Constants, width,
                          height);
 }
 LIBYUV_API
-int ABGRToNV12(const uint8_t* src_abgr,
+int ABGRToNV12(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
-  return ARGBToNV12Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_uv,
+  return ARGBToNV12Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y,
-                          dst_stride_uv, &kAbgrI601Constants, width, height);
+                          dst_uv, dst_stride_uv, &kAbgrI601Constants, width,
                          height);
 }
 // Same as NV12 but U and V swapped.
 LIBYUV_API
-int ABGRToNV21(const uint8_t* src_abgr,
+int ABGRToNV21(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_vu,
               int dst_stride_vu,
               int width,
               int height) {
-  return ARGBToNV21Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_vu,
+  return ARGBToNV21Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y,
-                          dst_stride_vu, &kAbgrI601Constants, width, height);
+                          dst_vu, dst_stride_vu, &kAbgrI601Constants, width,
                          height);
 }
 // Convert ARGB to YUY2.
 LIBYUV_API
-int ARGBToYUY2(const uint8_t* src_argb,
+int ARGBToYUY2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yuy2,
-               int src_stride_argb,
+               int dst_stride_yuy2, int width, int height) {
               uint8_t* dst_yuy2,
               int dst_stride_yuy2,
               int width,
               int height) {
  return ARGBToYUY2Matrix(src_argb, src_stride_argb, dst_yuy2, dst_stride_yuy2,
                          &kArgbI601Constants, width, height);
 }
 // Convert ARGB to UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8_t* src_argb,
+int ARGBToUYVY(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_uyvy,
-               int src_stride_argb,
+               int dst_stride_uyvy, int width, int height) {
               uint8_t* dst_uyvy,
               int dst_stride_uyvy,
               int width,
               int height) {
  return ARGBToUYVYMatrix(src_argb, src_stride_argb, dst_uyvy, dst_stride_uyvy,
                          &kArgbI601Constants, width, height);
 }
@ -1808,63 +1774,38 @@ int ARGBToAR30(const uint8_t* src_argb,
 // ARGB little endian (bgra in memory) to J444
 LIBYUV_API
-int ARGBToJ444(const uint8_t* src_argb,
+int ARGBToJ444(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y,
-               int src_stride_argb,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI444Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kArgbJPEGConstants, width, height);
 }
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8_t* src_argb,
+int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y,
-               int src_stride_argb,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kArgbJPEGConstants, width, height);
 }
 // Convert ARGB to J422. (JPeg full range I422).
 LIBYUV_API
-int ARGBToJ422(const uint8_t* src_argb,
+int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y,
-               int src_stride_argb,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI422Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kArgbJPEGConstants, width, height);
 }
 // Convert ARGB to J400.
 LIBYUV_API
-int ARGBToJ400(const uint8_t* src_argb,
+int ARGBToJ400(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y,
-               int src_stride_argb,
+               int dst_stride_y, int width, int height) {
               uint8_t* dst_y,
               int dst_stride_y,
               int width,
               int height) {
  return ARGBToI400Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
                          &kArgbJPEGConstants, width, height);
 }
@ -1967,36 +1908,22 @@ int RGBAToJ400(const uint8_t* src_rgba,
 // Convert ABGR to J420. (JPeg full range I420).
 LIBYUV_API
-int ABGRToJ420(const uint8_t* src_abgr,
+int ABGRToJ420(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrJPEGConstants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kAbgrJPEGConstants, width, height);
 }
 // Convert ABGR to J422. (JPeg full range I422).
 LIBYUV_API
-int ABGRToJ422(const uint8_t* src_abgr,
+int ABGRToJ422(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y,
-               int src_stride_abgr,
+               int dst_stride_y, uint8_t* dst_u, int dst_stride_u,
-               uint8_t* dst_y,
+               uint8_t* dst_v, int dst_stride_v, int width, int height) {
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrJPEGConstants,
+                          dst_stride_u, dst_v, dst_stride_v,
-                          width, height);
+                          &kAbgrJPEGConstants, width, height);
 }
 // Convert ABGR to J400.
@ -2298,34 +2225,34 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
+  if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
+  if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
+    if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
      }
    }
  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -8,13 +8,13 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/planar_functions.h"
 #include <assert.h>
 #include <limits.h>
 #include <string.h>  // for memset()
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
 #include "libyuv/scale_row.h"  // for ScaleRowDown2
@ -630,6 +630,14 @@ void SplitUVPlane(const uint8_t* src_uv,
    }
  }
 #endif
 #if defined(HAS_SPLITUVROW_AVX512BW)
  // When the CPU reports AVX512BW, route SplitUVRow to the AVX512BW
  // implementation: the _Any_ variant by default, and the plain variant only
  // when IS_ALIGNED(width, 64) holds (presumably width is a multiple of 64;
  // IS_ALIGNED is defined elsewhere).
  if (TestCpuFlag(kCpuHasAVX512BW)) {
    SplitUVRow = SplitUVRow_Any_AVX512BW;
    if (IS_ALIGNED(width, 64)) {
      SplitUVRow = SplitUVRow_AVX512BW;
    }
  }
 #endif
 #if defined(HAS_SPLITUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SplitUVRow = SplitUVRow_Any_NEON;
@ -2588,6 +2596,14 @@ void MirrorPlane(const uint8_t* src_y,
    }
  }
 #endif
 #if defined(HAS_MIRRORROW_AVX512BW)
  // When the CPU reports AVX512BW, route MirrorRow to the AVX512BW
  // implementation: the _Any_ variant by default, and the plain variant only
  // when IS_ALIGNED(width, 64) holds (presumably width is a multiple of 64;
  // IS_ALIGNED is defined elsewhere).
  if (TestCpuFlag(kCpuHasAVX512BW)) {
    MirrorRow = MirrorRow_Any_AVX512BW;
    if (IS_ALIGNED(width, 64)) {
      MirrorRow = MirrorRow_AVX512BW;
    }
  }
 #endif
 #if defined(HAS_MIRRORROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
    MirrorRow = MirrorRow_Any_LSX;
--- a/source/rotate.cc
+++ b/source/rotate.cc
@ -8,11 +8,11 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "libyuv/rotate.h"
 #include <assert.h>
 #include <limits.h>
 #include "libyuv/rotate.h"
 #include "libyuv/convert.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
@ -403,6 +403,11 @@ void SplitRotateUV180(const uint8_t* src,
    MirrorSplitUVRow = MirrorSplitUVRow_AVX2;
  }
 #endif
 #if defined(HAS_MIRRORSPLITUVROW_AVX512BW)
  // Select the AVX512BW MirrorSplitUVRow only when the CPU reports AVX512BW
  // and IS_ALIGNED(width, 32) holds. No _Any_ variant is assigned in this
  // branch, so for other widths the previously selected function stays in
  // effect.
  if (TestCpuFlag(kCpuHasAVX512BW) && IS_ALIGNED(width, 32)) {
    MirrorSplitUVRow = MirrorSplitUVRow_AVX512BW;
  }
 #endif
 #if defined(HAS_MIRRORSPLITUVROW_LSX)
  if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) {
    MirrorSplitUVRow = MirrorSplitUVRow_LSX;
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -1500,12 +1500,8 @@ ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
 ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_AVX512BW
-ANY11P(ARGBShuffleRow_Any_AVX512BW,
+ANY11P(ARGBShuffleRow_Any_AVX512BW, ARGBShuffleRow_AVX512BW, const uint8_t*, 4,
-       ARGBShuffleRow_AVX512BW,
+       4, 31)
       const uint8_t*,
       4,
       4,
       31)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_NEON
 ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
@ -1849,13 +1845,8 @@ ANY11I(InterpolateRow_16_Any_NEON,
       7)
 #endif
 #ifdef HAS_INTERPOLATEROW_16_AVX2
-ANY11I(InterpolateRow_16_Any_AVX2,
+ANY11I(InterpolateRow_16_Any_AVX2, InterpolateRow_16_AVX2, uint16_t, uint16_t,
-       InterpolateRow_16_AVX2,
+       1, 1, 15)
       uint16_t,
       uint16_t,
       1,
       1,
       15)
 #endif
 #undef ANY11I
@ -1919,6 +1910,9 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
    memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
  }
 #ifdef HAS_MIRRORROW_AVX512BW
 ANY11M(MirrorRow_Any_AVX512BW, MirrorRow_AVX512BW, 1, 63)
 #endif
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
 #endif
@ -2022,6 +2016,9 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3)
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
 #endif
 #ifdef HAS_SPLITUVROW_AVX512BW
 ANY12(SplitUVRow_Any_AVX512BW, SplitUVRow_AVX512BW, 0, 2, 0, 63)
 #endif
 #ifdef HAS_SPLITUVROW_AVX2
 ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
 #endif
@ -2291,6 +2288,9 @@ ANY12MS(ARGB4444ToUVMatrixRow_Any_AVX2, ARGB4444ToUVMatrixRow_AVX2, 0, 2, 31)
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW
 ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63)
 #endif
 #ifdef HAS_RGBTOUVMATRIXROW_AVX512BW
 ANY12MS(RGBToUVMatrixRow_Any_AVX512BW, RGBToUVMatrixRow_AVX512BW, 0, 3, 63)
 #endif
 #ifdef HAS_ARGBTOUVMATRIXROW_SSSE3
 ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7)
 #endif
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -749,31 +749,25 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
 MAKEROWYJ(RGBA, 3, 2, 1, 4)
 #undef MAKEROWYJ
-static __inline uint8_t RGBToYMatrix(uint8_t b0,
+static __inline uint8_t RGBToYMatrix(uint8_t b0, uint8_t b1, uint8_t b2,
                                     uint8_t b1,
                                     uint8_t b2,
                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
  return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
          c->kRGBToY[3] * b3 + c->kAddY[0]) >>
         8;
 }
-static __inline uint8_t RGBToUMatrix(uint8_t b0,
+static __inline uint8_t RGBToUMatrix(uint8_t b0, uint8_t b1, uint8_t b2,
                                     uint8_t b1,
                                     uint8_t b2,
                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
  return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
-                         c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
+                          c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
         8;
 }
-static __inline uint8_t RGBToVMatrix(uint8_t b0,
+static __inline uint8_t RGBToVMatrix(uint8_t b0, uint8_t b1, uint8_t b2,
                                     uint8_t b1,
                                     uint8_t b2,
                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
  return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
-                         c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
+                          c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
         8;
 }
@ -783,7 +777,8 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
                        const struct ArgbConstants* c) {
  int x;
  for (x = 0; x < width; ++x) {
-    dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
+    dst_y[0] =
        RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
    src_argb += 4;
    dst_y += 1;
  }
@ -1513,18 +1508,18 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
-#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV)   \
+#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \
-  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =     \
+  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =   \
-      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV),   \
+      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \
-                        -(RV), 0, AY, AUV);                                    \
+                        -(RV), 0, AY, AUV);                                  \
-  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =     \
+  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =   \
-      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV),   \
+      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \
-                        -(BV), 0, AY, AUV);                                    \
+                        -(BV), 0, AY, AUV);                                  \
-  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =     \
+  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =   \
-      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),       \
+      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),     \
-                        -(GV), -(RV), AY, AUV);                                \
+                        -(GV), -(RV), AY, AUV);                              \
-  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =     \
+  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =   \
-      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),       \
+      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),     \
                        -(GV), -(BV), AY, AUV);
 // BT.601 limited range RGB to YUV coefficients
@ -4556,9 +4551,7 @@ void HalfMergeUVRow_C(const uint8_t* src_u,
 #undef STATIC_CAST
-void RGBToYMatrixRow_C(const uint8_t* src_rgb,
+void RGBToYMatrixRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width,
                       uint8_t* dst_y,
                       int width,
                       const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
@ -4571,11 +4564,8 @@ void RGBToYMatrixRow_C(const uint8_t* src_rgb,
  }
 }
-void RGBToUVMatrixRow_C(const uint8_t* src_rgb,
+void RGBToUVMatrixRow_C(const uint8_t* src_rgb, int src_stride_rgb,
-                        int src_stride_rgb,
+                        uint8_t* dst_u, uint8_t* dst_v, int width,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width,
                        const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
@ -4591,9 +4581,7 @@ void RGBToUVMatrixRow_C(const uint8_t* src_rgb,
 }
 #if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2)
-void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb,
+void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_y, int width,
                          uint8_t* dst_y,
                          int width,
                          const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
@ -4608,18 +4596,14 @@ void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb,
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2)
-void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
+void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, int src_stride_rgb,
-                           int src_stride_rgb,
+                           uint8_t* dst_u, uint8_t* dst_v, int width,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width,
                           const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    RGB24ToARGBRow_AVX2(src_rgb, row, twidth);
-    RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb,
+    RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
                        row + MAXTWIDTH * 4, twidth);
    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_rgb += twidth * 3;
    dst_u += twidth / 2;
@ -4629,12 +4613,29 @@ void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
 }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) && \
    defined(HAS_RGB24TOARGBROW_AVX512BW)
 // Converts two rows of 3-byte-per-pixel RGB into U and V output rows.
 // The work is done in chunks of at most MAXTWIDTH pixels: both source rows
 // are expanded to ARGB in a stack buffer by RGB24ToARGBRow_AVX512BW, and the
 // pair of temporary rows is handed to ARGBToUVMatrixRow_AVX512BW.
 //   src_rgb         first source row
 //   src_stride_rgb  byte offset from src_rgb to the second source row
 //   dst_u, dst_v    output rows; each advances by twidth / 2 bytes per chunk
 //   width           number of source pixels to convert
 //   c               passed through unchanged to ARGBToUVMatrixRow_AVX512BW
 void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb, int src_stride_rgb,
                               uint8_t* dst_u, uint8_t* dst_v, int width,
                               const struct ArgbConstants* c) {
  // Two temporary ARGB rows back to back: the first at row, the second at
  // row + MAXTWIDTH * 4.
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    // Pixels handled in this pass, capped at the temporary row capacity.
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    RGB24ToARGBRow_AVX512BW(src_rgb, row, twidth);
    RGB24ToARGBRow_AVX512BW(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4,
                            twidth);
    // MAXTWIDTH * 4 is the distance between the two temporary rows;
    // presumably the second argument is the ARGB row stride -- confirm
    // against the ARGBToUVMatrixRow_AVX512BW declaration.
    ARGBToUVMatrixRow_AVX512BW(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_rgb += twidth * 3;  // 3 source bytes per pixel
    // NOTE(review): twidth / 2 truncates; this assumes MAXTWIDTH is even so
    // that only the final chunk can have an odd pixel count -- confirm.
    dst_u += twidth / 2;
    dst_v += twidth / 2;
    width -= twidth;
  }
 }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_RGB24TOARGBROW_NEON)
-void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb,
+void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, int src_stride_rgb,
-                           int src_stride_rgb,
+                           uint8_t* dst_u, uint8_t* dst_v, int width,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width,
                           const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
@ -4650,9 +4651,7 @@ void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb,
 }
 #endif
-void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565,
+void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width,
                          uint8_t* dst_y,
                          int width,
                          const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
@ -4665,17 +4664,15 @@ void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565,
  }
 }
-void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
+void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565, int src_stride_rgb565,
-                           int src_stride_rgb565,
+                           uint8_t* dst_u, uint8_t* dst_v, int width,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width,
                           const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    RGB565ToARGBRow_C(src_rgb565, row, twidth);
-    RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth);
+    RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
                      twidth);
    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_rgb565 += twidth * 2;
    dst_u += twidth / 2;
@ -4685,10 +4682,8 @@ void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
 }
 #if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2)
-void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565,
+void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_y,
-                             uint8_t* dst_y,
+                             int width, const struct ArgbConstants* c) {
                             int width,
                             const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4702,18 +4697,15 @@ void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565,
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2)
-void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565,
+void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, int src_stride_rgb565,
-                              int src_stride_rgb565,
+                              uint8_t* dst_u, uint8_t* dst_v, int width,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width,
                              const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    RGB565ToARGBRow_AVX2(src_rgb565, row, twidth);
-    RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565,
+    RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
-                         row + MAXTWIDTH * 4, twidth);
+                         twidth);
    ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_rgb565 += twidth * 2;
    dst_u += twidth / 2;
@ -4724,10 +4716,8 @@ void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565,
 #endif
 #if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOYMATRIXROW_NEON)
-void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565,
+void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y,
-                             uint8_t* dst_y,
+                             int width, const struct ArgbConstants* c) {
                             int width,
                             const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4741,17 +4731,15 @@ void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565,
 #endif
 #if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOUVMATRIXROW_NEON)
-void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565,
+void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565,
-                              int src_stride_rgb565,
+                              uint8_t* dst_u, uint8_t* dst_v, int width,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width,
                              const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    RGB565ToARGBRow_NEON(src_rgb565, row, twidth);
-    RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth);
+    RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
                         twidth);
    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_rgb565 += twidth * 2;
    dst_u += twidth / 2;
@ -4761,10 +4749,8 @@ void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565,
 }
 #endif
-void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555,
+void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555, uint8_t* dst_y,
-                            uint8_t* dst_y,
+                            int width, const struct ArgbConstants* c) {
                            int width,
                            const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4777,16 +4763,15 @@ void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555,
 }
 void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
-                             int src_stride_argb1555,
+                             int src_stride_argb1555, uint8_t* dst_u,
-                             uint8_t* dst_u,
+                             uint8_t* dst_v, int width,
                             uint8_t* dst_v,
                             int width,
                             const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    ARGB1555ToARGBRow_C(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth);
+    ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4,
                        twidth);
    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_argb1555 += twidth * 2;
    dst_u += twidth / 2;
@ -4795,10 +4780,8 @@ void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
  }
 }
-void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444,
+void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444, uint8_t* dst_y,
-                            uint8_t* dst_y,
+                            int width, const struct ArgbConstants* c) {
                            int width,
                            const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4811,16 +4794,15 @@ void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444,
 }
 void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
-                             int src_stride_argb4444,
+                             int src_stride_argb4444, uint8_t* dst_u,
-                             uint8_t* dst_u,
+                             uint8_t* dst_v, int width,
                             uint8_t* dst_v,
                             int width,
                             const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    ARGB4444ToARGBRow_C(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth);
+    ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4,
                        twidth);
    ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_argb4444 += twidth * 2;
    dst_u += twidth / 2;
@ -4831,10 +4813,8 @@ void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
 #if defined(HAS_ARGBTOYMATRIXROW_AVX2)
 #if defined(HAS_ARGB1555TOARGBROW_AVX2)
-void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555,
+void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_y,
-                               uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c) {
                               int width,
                               const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4848,10 +4828,8 @@ void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555,
 #endif
 #if defined(HAS_ARGB4444TOARGBROW_AVX2)
-void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444,
+void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_y,
-                               uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c) {
                               int width,
                               const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4868,10 +4846,8 @@ void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444,
 #if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
 #if defined(HAS_ARGB1555TOARGBROW_AVX2)
 void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
+                                int src_stride_argb1555, uint8_t* dst_u,
-                                uint8_t* dst_u,
+                                uint8_t* dst_v, int width,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
@ -4890,10 +4866,8 @@ void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555,
 #if defined(HAS_ARGB4444TOARGBROW_AVX2)
 void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
+                                int src_stride_argb4444, uint8_t* dst_u,
-                                uint8_t* dst_u,
+                                uint8_t* dst_v, int width,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
@ -4912,10 +4886,8 @@ void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444,
 #endif
 #if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON)
-void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555,
+void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y,
-                               uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c) {
                               int width,
                               const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4929,10 +4901,8 @@ void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555,
 #endif
 #if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON)
-void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444,
+void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y,
-                               uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c) {
                               int width,
                               const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
@ -4947,16 +4917,15 @@ void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444,
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON)
 void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
-                                int src_stride_argb1555,
+                                int src_stride_argb1555, uint8_t* dst_u,
-                                uint8_t* dst_u,
+                                uint8_t* dst_v, int width,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth);
+    ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555,
                           row + MAXTWIDTH * 4, twidth);
    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_argb1555 += twidth * 2;
    dst_u += twidth / 2;
@ -4968,16 +4937,15 @@ void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON)
 void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444,
-                                int src_stride_argb4444,
+                                int src_stride_argb4444, uint8_t* dst_u,
-                                uint8_t* dst_u,
+                                uint8_t* dst_v, int width,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c) {
  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth);
+    ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444,
                           row + MAXTWIDTH * 4, twidth);
    ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
    src_argb4444 += twidth * 2;
    dst_u += twidth / 2;
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@ -120,11 +120,11 @@ static const lvec8 kShuffleNV21 = {
 #if defined(HAS_J400TOARGBROW_AVX2) || defined(HAS_J400TOARGBROW_AVX512BW)
 alignas(64) static const uint8_t kShuffleMaskJ400ToARGB[64] = {
-    0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
+    0u,  0u,   0u,  128u, 1u,  1u,   1u,  128u, 2u,  2u,   2u,  128u, 3u,  3u,
-    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u,
+    3u,  128u, 4u,  4u,   4u,  128u, 5u,  5u,   5u,  128u, 6u,  6u,   6u,  128u,
-    8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
+    7u,  7u,   7u,  128u, 8u,  8u,   8u,  128u, 9u,  9u,   9u,  128u, 10u, 10u,
-    12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
+    10u, 128u, 11u, 11u,  11u, 128u, 12u, 12u,  12u, 128u, 13u, 13u,  13u, 128u,
-};
+    14u, 14u,  14u, 128u, 15u, 15u,  15u, 128u};
 #endif
 #ifdef HAS_J400TOARGBROW_AVX2
@ -149,16 +149,17 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
-      : "+r"(src_y),     // %0
+      : "+r"(src_y),                 // %0
-        "+r"(dst_argb),  // %1
+        "+r"(dst_argb),              // %1
-        "+r"(width)      // %2
+        "+r"(width)                  // %2
-      : "r"(kShuffleMaskJ400ToARGB) // %3
+      : "r"(kShuffleMaskJ400ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_J400TOARGBROW_AVX2
 #ifdef HAS_J400TOARGBROW_AVX512BW
-void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vpternlogd  $0xff,%%zmm7,%%zmm7,%%zmm7    \n"  // 0xffffffff
      "vpslld      $0x18,%%zmm7,%%zmm7           \n"  // 0xff000000
@ -179,10 +180,10 @@ void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width)
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
-      : "+r"(src_y),     // %0
+      : "+r"(src_y),                 // %0
-        "+r"(dst_argb),  // %1
+        "+r"(dst_argb),              // %1
-        "+r"(width)      // %2
+        "+r"(width)                  // %2
-      : "m"(kShuffleMaskJ400ToARGB) // %3
+      : "m"(kShuffleMaskJ400ToARGB)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5", "xmm7");
 }
 #endif  // HAS_J400TOARGBROW_AVX512BW
@ -221,15 +222,16 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
      "lea         0x40(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
-      : "+r"(src_rgb24),              // %0
+      : "+r"(src_rgb24),                 // %0
-        "+r"(dst_argb),               // %1
+        "+r"(dst_argb),                  // %1
-        "+r"(width)                   // %2
+        "+r"(width)                      // %2
      : "m"(kShuffleMaskRGB24ToARGB[0])  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #ifdef HAS_RGB24TOARGBROW_AVX2
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb,
                         int width) {
  // Reference to prevent discarding of kShuffleMaskRGB24ToARGB[1] which is
  // accessed via offset in assembly.
  const uvec8* dummy = &kShuffleMaskRGB24ToARGB[1];
@ -267,9 +269,9 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width)
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
-      : "+r"(src_rgb24),              // %0
+      : "+r"(src_rgb24),                 // %0
-        "+r"(dst_argb),               // %1
+        "+r"(dst_argb),                  // %1
-        "+r"(width)                   // %2
+        "+r"(width)                      // %2
      : "m"(kShuffleMaskRGB24ToARGB[0])  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
@ -399,7 +401,8 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint
        "+r"(width)                     // %2
      : "m"(kPermdRAWToARGB_AVX512BW),  // %3
        "m"(*shuffler)                  // %4
-      : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
        "xmm5", "xmm6");
 }
 void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
@ -1511,15 +1514,14 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
      "vpsllw      $15,%%zmm16,%%zmm5            \n"
      "vpacksswb   %%zmm5,%%zmm5,%%zmm5          \n"
-      "vpsrlw      $15,%%zmm16,%%zmm16           \n" // zmm16 = 1
+      "vpsrlw      $15,%%zmm16,%%zmm16           \n"  // zmm16 = 1
      "vbroadcasti64x4 0(%3),%%zmm4              \n"
      "vbroadcasti64x4 0x60(%3),%%zmm7           \n"
      "vpmaddubsw  %%zmm5,%%zmm4,%%zmm6          \n"
      "vpmaddwd    %%zmm16,%%zmm6,%%zmm6         \n"
      "vpackssdw   %%zmm6,%%zmm6,%%zmm6          \n"
      "vpsubw      %%zmm6,%%zmm7,%%zmm7          \n"
-      "vmovups     %4,%%zmm6                     \n"
+      "vmovups     %4,%%zmm6                     \n" LABELALIGN
      LABELALIGN
      "1:          \n"
      "vmovups     (%0),%%zmm0                   \n"
      "vmovups     0x40(%0),%%zmm1               \n"
@ -1551,11 +1553,11 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
      "sub         $0x40,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
-      : "+r"(src_argb),  // %0
+      : "+r"(src_argb),              // %0
-        "+r"(dst_y),     // %1
+        "+r"(dst_y),                 // %1
-        "+r"(width)      // %2
+        "+r"(width)                  // %2
-      : "r"(c),          // %3
+      : "r"(c),                      // %3
-        "m"(kPermdARGBToY_AVX512BW) // %4
+        "m"(kPermdARGBToY_AVX512BW)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm16");
 }
@ -1713,8 +1715,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
  asm volatile(
      "vbroadcasti64x4 0x20(%4),%%zmm3               \n"  // kRGBToU
      "vbroadcasti64x4 0x40(%4),%%zmm4               \n"  // kRGBToV
-      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"  // -1
+      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"      // -1
-      "vpsllw      $15,%%zmm16,%%zmm5            \n"  // 0x8000
+      "vpsllw      $15,%%zmm16,%%zmm5            \n"      // 0x8000
      "vmovups     %5,%%zmm7                     \n"
      "sub         %1,%2                         \n"
@ -2174,8 +2176,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
      "vbroadcasti64x4 0x20(%5),%%zmm4               \n"  // RGBToU
      "vbroadcasti64x4 0x40(%5),%%zmm5               \n"  // RGBToV
      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
-      "vpabsb      %%zmm16,%%zmm6                \n"  // 0x0101
+      "vpabsb      %%zmm16,%%zmm6                \n"      // 0x0101
-      "vpsllw      $15,%%zmm16,%%zmm17           \n"  // 0x8000
+      "vpsllw      $15,%%zmm16,%%zmm17           \n"      // 0x8000
      "vbroadcasti64x4 %6,%%zmm7                     \n"  // kShuffleAARRGGBB
      "vmovups     %7,%%zmm18                    \n"  // kPermdARGBToY_AVX512BW
      "vmovups     %8,%%zmm19                    \n"  // kPermdARGBToUV_AVX512BW
@ -2209,7 +2211,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
      "vpmaddubsw  %%zmm5,%%zmm0,%%zmm0          \n"  // 16 V
      "vpmaddwd    %%zmm16,%%zmm1,%%zmm1         \n"
      "vpmaddwd    %%zmm16,%%zmm0,%%zmm0         \n"
-      "vpackssdw   %%zmm0,%%zmm1,%%zmm0          \n"  // mutates (U in lower, V in upper)
+      "vpackssdw   %%zmm0,%%zmm1,%%zmm0          \n"  // mutates (U in lower, V
                                                      // in upper)
      "vpaddw      %%zmm17,%%zmm0,%%zmm0         \n"
      "vpsrlw      $0x8,%%zmm0,%%zmm0            \n"
      "vpackuswb   %%zmm0,%%zmm0,%%zmm0          \n"  // mutates
@ -4601,6 +4604,29 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_SSSE3
 #ifdef HAS_MIRRORROW_AVX512BW
 // Mirrors a row of bytes with AVX512BW, 64 bytes per loop iteration: source
 // bytes are read from the end of the row backwards while dst is written
 // forwards.
 //   src    start of the source row
 //   dst    start of the destination row
 //   width  number of bytes to mirror
 // NOTE(review): the first load is at src + width - 64 and the counter drops
 // by 64 per pass, so width looks like it must be a positive multiple of 64 --
 // confirm that callers (or an "Any" wrapper) guarantee this.
 void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {
  // Pointer-sized copy of width so it can serve as the index register (%2).
  ptrdiff_t temp_width = (ptrdiff_t)(width);
  // zmm5 = the 128-bit kShuffleMirror table repeated in all four lanes.
  // kShuffleMirror is defined outside this view; presumably it reverses the
  // 16 bytes of a lane -- confirm.
  asm volatile("vbroadcasti32x4 %3,%%zmm5                 \n"
               LABELALIGN
               "1:          \n"
               // Load the 64 bytes that end at src + remaining count.
               "vmovdqu8    -0x40(%0,%2,1),%%zmm0         \n"
               // Shuffle bytes inside each 128-bit lane using zmm5.
               "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
               // 0x1b selects lanes 3,2,1,0: reverse the four 128-bit lanes.
               "vshufi64x2  $0x1b,%%zmm0,%%zmm0,%%zmm0    \n"
               // Store 64 bytes and step dst forward by 64.
               "vmovdqu8    %%zmm0,(%1)                   \n"
               "lea         0x40(%1),%1                   \n"
               // Remaining count -= 64; loop while it is still positive.
               "sub         $0x40,%2                      \n"
               "jg          1b                            \n"
               "vzeroupper  \n"
               : "+r"(src),           // %0
                 "+r"(dst),           // %1
                 "+r"(temp_width)     // %2
               : "m"(kShuffleMirror)  // %3
               : "memory", "cc", "zmm0", "zmm5");
 }
 #endif  // HAS_MIRRORROW_AVX512BW
 #ifdef HAS_MIRRORROW_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  ptrdiff_t temp_width = (ptrdiff_t)(width);
@ -4624,14 +4650,49 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORSPLITUVROW_AVX2
+#if defined(HAS_MIRRORSPLITUVROW_AVX2) || defined(HAS_MIRRORSPLITUVROW_AVX512BW)
 // Shuffle table for reversing the bytes of UV channels.
 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
 #endif
-void MirrorSplitUVRow_AVX2(const uint8_t* src,
+#ifdef HAS_MIRRORSPLITUVROW_AVX512BW
-                           uint8_t* dst_u,
+static const uint64_t kMirrorSplitUVPermute[8] = {6, 4, 2, 0, 7, 5, 3, 1};
-                           uint8_t* dst_v,
+
 // Mirrors a row of interleaved byte pairs and splits it into two planes with
 // AVX512BW: the even-offset bytes go to dst_u and the odd-offset bytes go to
 // dst_v, both in reversed order. Each loop iteration consumes 64 source
 // bytes (32 pairs) and writes 32 bytes to each destination.
 //   src    start of the interleaved source row
 //   dst_u  destination for the first byte of every pair
 //   dst_v  destination for the second byte of every pair
 //   width  number of pairs in the row
 // NOTE(review): the first load is at src + 2 * width - 64 and the counter
 // drops by 32 per pass, so width looks like it must be a positive multiple
 // of 32 -- confirm that callers (or an "Any" wrapper) guarantee this.
 void MirrorSplitUVRow_AVX512BW(const uint8_t* src, uint8_t* dst_u,
                               uint8_t* dst_v, int width) {
  // Pointer-sized copy of width so it can serve as the index register (%3).
  ptrdiff_t temp_width = (ptrdiff_t)(width);
  asm volatile(
      // zmm1 = kShuffleMirrorSplitUV repeated in all four 128-bit lanes.
      "vbroadcasti32x4 %4,%%zmm1                 \n"
      // src = src + 2 * width - 64: the last 64 bytes of the row.
      "lea         -0x40(%0,%3,2),%0             \n"
      // %2 becomes the byte distance dst_v - dst_u.
      "sub         %1,%2                         \n"
      // zmm3 = qword indices from kMirrorSplitUVPermute {6,4,2,0,7,5,3,1}.
      "vmovdqu64   %5,%%zmm3                     \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu8    (%0),%%zmm0                   \n"  // load 32 pairs
      "lea         -0x40(%0),%0                  \n"  // step src back 64
      // Per lane: reversed even bytes in the low qword, reversed odd bytes
      // in the high qword.
      "vpshufb     %%zmm1,%%zmm0,%%zmm0          \n"
      // Gather low qwords of lanes 3..0 into the low 256 bits and high
      // qwords of lanes 3..0 into the high 256 bits.
      "vpermq      %%zmm0,%%zmm3,%%zmm0          \n"
      "vextracti64x4 $0x1,%%zmm0,%%ymm2          \n"  // ymm2 = odd bytes
      "vmovdqu     %%ymm0,(%1)                   \n"  // 32 bytes to dst_u
      "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"  // 32 bytes to dst_v
      "lea         0x20(%1),%1                   \n"
      // Remaining pairs -= 32; loop while the count is still positive.
      "sub         $0x20,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src),                   // %0
        "+r"(dst_u),                 // %1
        "+r"(dst_v),                 // %2
        "+r"(temp_width)             // %3
      : "m"(kShuffleMirrorSplitUV),  // %4
        "m"(kMirrorSplitUVPermute)   // %5
      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3");
 }
 #endif  // HAS_MIRRORSPLITUVROW_AVX512BW
 #ifdef HAS_MIRRORSPLITUVROW_AVX2
 void MirrorSplitUVRow_AVX2(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
                           int width) {
  ptrdiff_t temp_width = (ptrdiff_t)(width);
  asm volatile(
@ -4759,16 +4820,13 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
 #ifdef HAS_RGB24MIRRORROW_AVX2
 // Shuffle first 10 pixels to last 10 mirrored.  first byte zero
 static const uvec8 kShuffleMirrorRGB0_AVX = {
-    128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u
+    128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
 };
 // Shuffle last 2 pixels to first 2 mirrored.  last byte zero
 static const uvec8 kShuffleMirrorRGB1_AVX = {
-    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u
+    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
 };
-void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
+void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  ptrdiff_t temp_width = (ptrdiff_t)(width);
  src_rgb24 += width * 3 - 96;
@ -4801,9 +4859,9 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
-      : "+r"(src_rgb24),          // %0
+      : "+r"(src_rgb24),              // %0
-        "+r"(dst_rgb24),          // %1
+        "+r"(dst_rgb24),              // %1
-        "+r"(temp_width)          // %2
+        "+r"(temp_width)              // %2
      : "m"(kShuffleMirrorRGB0_AVX),  // %3
        "m"(kShuffleMirrorRGB1_AVX)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
@ -4894,6 +4952,45 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
 }
 #endif  // HAS_SPLITUVROW_AVX2
 #ifdef HAS_SPLITUVROW_AVX512BW
 static const uint64_t kSplitUVPermute[8] = {0, 2, 4, 6, 1, 3, 5, 7};
 // Splits a row of interleaved byte pairs into two planes with AVX512BW: the
 // even-offset bytes go to dst_u and the odd-offset bytes go to dst_v, in
 // source order. Each loop iteration consumes 128 source bytes (64 pairs) and
 // writes 64 bytes to each destination.
 //   src_uv  start of the interleaved source row
 //   dst_u   destination for the first byte of every pair
 //   dst_v   destination for the second byte of every pair
 //   width   number of pairs in the row
 // NOTE(review): the counter drops by 64 per pass with full 64-byte stores, so
 // width looks like it must be a positive multiple of 64 -- confirm that
 // callers (or an "Any" wrapper) guarantee this.
 void SplitUVRow_AVX512BW(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpternlogd  $0xff,%%zmm5,%%zmm5,%%zmm5    \n"  // zmm5 = all ones
      "vpsrlw      $0x8,%%zmm5,%%zmm5            \n"  // 0x00ff in each word
      // zmm4 = qword indices from kSplitUVPermute {0,2,4,6,1,3,5,7}.
      "vmovdqu64   %4,%%zmm4                     \n"
      // %2 becomes the byte distance dst_v - dst_u.
      "sub         %1,%2                         \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu8    (%0),%%zmm0                   \n"  // pairs 0..31
      "vmovdqu8    0x40(%0),%%zmm1               \n"  // pairs 32..63
      "lea         0x80(%0),%0                   \n"
      // High byte of each word (odd-offset bytes) into zmm2/zmm3.
      "vpsrlw      $0x8,%%zmm0,%%zmm2            \n"
      "vpsrlw      $0x8,%%zmm1,%%zmm3            \n"
      // Low byte of each word (even-offset bytes) stays in zmm0/zmm1.
      "vpandd      %%zmm5,%%zmm0,%%zmm0          \n"
      "vpandd      %%zmm5,%%zmm1,%%zmm1          \n"
      // Pack words to bytes per 128-bit lane; within a lane the low qword
      // comes from the first 64 source bytes and the high qword from the
      // second 64, so the qwords end up interleaved.
      "vpackuswb   %%zmm1,%%zmm0,%%zmm0          \n"
      "vpackuswb   %%zmm3,%%zmm2,%%zmm2          \n"
      // Reorder qwords 0,2,4,6,1,3,5,7 to restore linear byte order.
      "vpermq      %%zmm0,%%zmm4,%%zmm0          \n"
      "vpermq      %%zmm2,%%zmm4,%%zmm2          \n"
      "vmovdqu8    %%zmm0,(%1)                   \n"  // 64 bytes to dst_u
      "vmovdqu8    %%zmm2,0x00(%1,%2,1)          \n"  // 64 bytes to dst_v
      "lea         0x40(%1),%1                   \n"
      // Remaining pairs -= 64; loop while the count is still positive.
      "sub         $0x40,%3                      \n"
      "jg          1b                            \n"
      "vzeroupper  \n"
      : "+r"(src_uv),         // %0
        "+r"(dst_u),          // %1
        "+r"(dst_v),          // %2
        "+r"(width)           // %3
      : "m"(kSplitUVPermute)  // %4
      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5");
 }
 #endif  // HAS_SPLITUVROW_AVX512BW
 #ifdef HAS_SPLITUVROW_SSE2
 void SplitUVRow_SSE2(const uint8_t* src_uv,
                     uint8_t* dst_u,
@ -8765,10 +8862,8 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
 #ifdef HAS_INTERPOLATEROW_16_AVX2
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
+void InterpolateRow_16_AVX2(uint16_t* dst_ptr, const uint16_t* src_ptr,
-                            const uint16_t* src_ptr,
+                            ptrdiff_t src_stride, int width,
                            ptrdiff_t src_stride,
                            int width,
                            int source_y_fraction) {
  asm volatile(
      "sub         %1,%0                         \n"
@ -8783,10 +8878,14 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
      "vmovd       %3,%%xmm5                     \n"
      "vpunpcklwd  %%xmm0,%%xmm5,%%xmm5          \n"
      "vpbroadcastd %%xmm5,%%ymm5                \n"
-      "mov         $0x80008000,%%eax             \n"  // 0x80008000 used to bias unsigned words to signed range for vpmaddwd.
+      "mov         $0x80008000,%%eax             \n"  // 0x80008000 used to bias
+                                                      // unsigned words to
+                                                      // signed range for
+                                                      // vpmaddwd.
      "vmovd       %%eax,%%xmm4                  \n"
      "vbroadcastss %%xmm4,%%ymm4                \n"
-      "mov         $8388736,%%eax                \n"  // 32768 * 256 + 128 rounding constant.
+      "mov         $8388736,%%eax                \n"  // 32768 * 256 + 128
+                                                      // rounding constant.
      "vmovd       %%eax,%%xmm3                  \n"
      "vbroadcastss %%xmm3,%%ymm3                \n"
@ -8811,8 +8910,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
      "jg          1b                            \n"
      "jmp         99f                           \n"
-      "50:         \n"
+      "50:         \n" LABELALIGN
-      LABELALIGN
      "2:          \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vpavgw      (%1,%4,2),%%ymm0,%%ymm0       \n"
@ -8822,8 +8920,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
      "jg          2b                            \n"
      "jmp         99f                           \n"
-      "100:        \n"
+      "100:        \n" LABELALIGN
-      LABELALIGN
      "3:          \n"
      "vmovdqu     (%1),%%ymm0                   \n"
      "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
@ -8901,31 +8998,28 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
 #ifdef HAS_ARGBSHUFFLEROW_AVX512BW
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
+void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_argb,
-                             uint8_t* dst_argb,
+                             const uint8_t* shuffler, int width) {
-                             const uint8_t* shuffler,
+  asm volatile("vbroadcasti32x4 (%3),%%zmm5               \n"
-                             int width) {
-  asm volatile(
-      "vbroadcasti32x4 (%3),%%zmm5               \n"
-      LABELALIGN
+               LABELALIGN
-      "1:          \n"
+               "1:          \n"
-      "vmovdqu8    (%0),%%zmm0                   \n"
+               "vmovdqu8    (%0),%%zmm0                   \n"
-      "vmovdqu8    0x40(%0),%%zmm1               \n"
+               "vmovdqu8    0x40(%0),%%zmm1               \n"
-      "lea         0x80(%0),%0                   \n"
+               "lea         0x80(%0),%0                   \n"
-      "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
+               "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
-      "vpshufb     %%zmm5,%%zmm1,%%zmm1          \n"
+               "vpshufb     %%zmm5,%%zmm1,%%zmm1          \n"
-      "vmovdqu8    %%zmm0,(%1)                   \n"
+               "vmovdqu8    %%zmm0,(%1)                   \n"
-      "vmovdqu8    %%zmm1,0x40(%1)               \n"
+               "vmovdqu8    %%zmm1,0x40(%1)               \n"
-      "lea         0x80(%1),%1                   \n"
+               "lea         0x80(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
+               "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
+               "jg          1b                            \n"
-      "vzeroupper  \n"
+               "vzeroupper  \n"
-      : "+r"(src_argb),  // %0
+               : "+r"(src_argb),  // %0
-        "+r"(dst_argb),  // %1
+                 "+r"(dst_argb),  // %1
-        "+r"(width)      // %2
+                 "+r"(width)      // %2
-      : "r"(shuffler)    // %3
+               : "r"(shuffler)    // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+               : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX512BW
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -1887,13 +1887,13 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
-      : "+r"(src_argb),     // %0
+      : "+r"(src_argb),    // %0
-        "+r"(dst_u),        // %1
+        "+r"(dst_u),       // %1
-        "+r"(dst_v),        // %2
+        "+r"(dst_v),       // %2
-        "+r"(width)         // %3
+        "+r"(width)        // %3
-      : "r"(&c->kRGBToU),   // %4
+      : "r"(&c->kRGBToU),  // %4
-        "r"(&c->kRGBToV),   // %5
+        "r"(&c->kRGBToV),  // %5
-        "r"(&c->kAddUV)     // %6
+        "r"(&c->kAddUV)    // %6
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q10", "q11", "q12");
 }
@ -1934,8 +1934,9 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                            int width,
                            const struct ArgbConstants* c) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile (
+  asm volatile(
-      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes, only 4 used)
+      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes,
+                                                      // only 4 used)
      "vld1.8      {d25}, [%6]                   \n"  // load kRGBToV
      "vmovl.s8    q14, d24                      \n"  // U coeffs in d28
      "vmovl.s8    q15, d25                      \n"  // V coeffs in d30
@ -1943,7 +1944,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
+                                                      // pixels.
      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
@ -1985,16 +1987,15 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
-  : "+r"(src_argb),  // %0
+      : "+r"(src_argb),    // %0
-    "+r"(src_argb_1),  // %1
+        "+r"(src_argb_1),  // %1
-    "+r"(dst_u),     // %2
+        "+r"(dst_u),       // %2
-    "+r"(dst_v),     // %3
+        "+r"(dst_v),       // %3
-    "+r"(width)        // %4
+        "+r"(width)        // %4
-  : "r"(&c->kRGBToU),  // %5
+      : "r"(&c->kRGBToU),  // %5
-    "r"(&c->kRGBToV)   // %6
+        "r"(&c->kRGBToV)   // %6
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
-    "q8", "q9", "q11", "q12", "q14", "q15"
+        "q9", "q11", "q12", "q14", "q15");
-  );
 }
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2736,26 +2736,26 @@ struct RgbUVConstants {
 };
 // 8x1 pixels.
-void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_u,
-                               uint8_t* dst_u,
+                               uint8_t* dst_v, int width,
-                               uint8_t* dst_v,
-                               int width,
                               const struct ArgbConstants* c) {
  asm volatile(
-      "ldr        q16, [%[c], #16]               \n" // kRGBToU
+      "ldr        q16, [%[c], #16]               \n"  // kRGBToU
-      "ldr        q17, [%[c], #32]               \n" // kRGBToV
+      "ldr        q17, [%[c], #32]               \n"  // kRGBToV
-      "ldr        s0, [%[c], #64]                \n" // kAddUV
+      "ldr        s0, [%[c], #64]                \n"  // kAddUV
-      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
+      "sxtl       v16.8h, v16.8b                 \n"  // sign extend U coeffs to
-      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
+                                                      // 16-bit
-      "dup        v20.8h, v16.h[0]               \n" // U0
+      "sxtl       v17.8h, v17.8b                 \n"  // sign extend V coeffs to
-      "dup        v21.8h, v16.h[1]               \n" // U1
+                                                      // 16-bit
-      "dup        v22.8h, v16.h[2]               \n" // U2
+      "dup        v20.8h, v16.h[0]               \n"  // U0
-      "dup        v23.8h, v16.h[3]               \n" // U3
+      "dup        v21.8h, v16.h[1]               \n"  // U1
-      "dup        v24.8h, v17.h[0]               \n" // V0
+      "dup        v22.8h, v16.h[2]               \n"  // U2
-      "dup        v26.8h, v17.h[1]               \n" // V1
+      "dup        v23.8h, v16.h[3]               \n"  // U3
-      "dup        v27.8h, v17.h[2]               \n" // V2
+      "dup        v24.8h, v17.h[0]               \n"  // V0
-      "dup        v28.8h, v17.h[3]               \n" // V3
+      "dup        v26.8h, v17.h[1]               \n"  // V1
-      "dup        v25.8h, v0.h[0]                \n" // kAddUV
+      "dup        v27.8h, v17.h[2]               \n"  // V2
+      "dup        v28.8h, v17.h[3]               \n"  // V3
+      "dup        v25.8h, v0.h[0]                \n"  // kAddUV
      "1:          \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
@ -2783,27 +2783,25 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
      "st1         {v0.8b}, [%1], #8             \n"
      "st1         {v1.8b}, [%2], #8             \n"
      "b.gt        1b                            \n"
-      : "+r"(src_argb),     // %0
+      : "+r"(src_argb),  // %0
-        "+r"(dst_u),        // %1
+        "+r"(dst_u),     // %1
-        "+r"(dst_v),        // %2
+        "+r"(dst_v),     // %2
-        "+r"(width)         // %3
+        "+r"(width)      // %3
-      : [c] "r"(c)          // %4
+      : [c] "r"(c)       // %4
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-        "v26", "v27", "v28");
+        "v27", "v28");
 }
-static void ARGBToUV444MatrixRow_NEON_I8MM(
+static void ARGBToUV444MatrixRow_NEON_I8MM(const uint8_t* src_argb,
-    const uint8_t* src_argb,
+                                           uint8_t* dst_u, uint8_t* dst_v,
-    uint8_t* dst_u,
+                                           int width,
-    uint8_t* dst_v,
+                                           const struct ArgbConstants* c) {
-    int width,
-    const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         q16, [%[c], #16]              \n" // kRGBToU
+      "ldr         q16, [%[c], #16]              \n"  // kRGBToU
-      "ldr         q17, [%[c], #32]              \n" // kRGBToV
+      "ldr         q17, [%[c], #32]              \n"  // kRGBToV
-      "ldr         s0, [%[c], #64]               \n" // kAddUV
+      "ldr         s0, [%[c], #64]               \n"  // kAddUV
-      "dup         v29.8h, v0.h[0]               \n" // 128.0
+      "dup         v29.8h, v0.h[0]               \n"  // 128.0
      "1:          \n"
      "ldp         q0, q1, [%[src]], #32         \n"
      "subs        %w[width], %w[width], #8      \n"  // 8 processed per loop.
@ -2823,11 +2821,11 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
      "str         d0, [%[dst_u]], #8            \n"  // store 8 pixels U.
      "str         d1, [%[dst_v]], #8            \n"  // store 8 pixels V.
      "b.gt        1b                            \n"
-      : [src] "+r"(src_argb),     // %[src]
+      : [src] "+r"(src_argb),  // %[src]
-        [dst_u] "+r"(dst_u),      // %[dst_u]
+        [dst_u] "+r"(dst_u),   // %[dst_u]
-        [dst_v] "+r"(dst_v),      // %[dst_v]
+        [dst_v] "+r"(dst_v),   // %[dst_v]
-        [width] "+r"(width)       // %[width]
+        [width] "+r"(width)    // %[width]
-      : [c] "r"(c)  // %[c]
+      : [c] "r"(c)             // %[c]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
        "v29");
 }
@ -2844,8 +2842,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbI601Constants);
-                            &kArgbI601Constants);
 }
 void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2860,8 +2857,7 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
-                            &kArgbJPEGConstants);
 }
 void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2903,23 +2899,27 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                            int width,
                            const struct ArgbConstants* c) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile (
+  asm volatile(
-      "ldr        q16, [%[c], #16]               \n" // kRGBToU
+      "ldr        q16, [%[c], #16]               \n"  // kRGBToU
-      "ldr        q17, [%[c], #32]               \n" // kRGBToV
+      "ldr        q17, [%[c], #32]               \n"  // kRGBToV
-      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
+      "sxtl       v16.8h, v16.8b                 \n"  // sign extend U coeffs to
-      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
+                                                      // 16-bit
-      "dup        v20.8h, v16.h[0]               \n" // U0
+      "sxtl       v17.8h, v17.8b                 \n"  // sign extend V coeffs to
-      "dup        v21.8h, v16.h[1]               \n" // U1
+                                                      // 16-bit
-      "dup        v22.8h, v16.h[2]               \n" // U2
+      "dup        v20.8h, v16.h[0]               \n"  // U0
-      "dup        v23.8h, v16.h[3]               \n" // U3
+      "dup        v21.8h, v16.h[1]               \n"  // U1
-      "dup        v24.8h, v17.h[0]               \n" // V0
+      "dup        v22.8h, v16.h[2]               \n"  // U2
-      "dup        v26.8h, v17.h[1]               \n" // V1
+      "dup        v23.8h, v16.h[3]               \n"  // U3
-      "dup        v27.8h, v17.h[2]               \n" // V2
+      "dup        v24.8h, v17.h[0]               \n"  // V0
-      "dup        v28.8h, v17.h[3]               \n" // V3
+      "dup        v26.8h, v17.h[1]               \n"  // V1
-      "movi       v25.8h, #0x80, lsl #8          \n" // 128.0 in 16-bit (0x8000)
+      "dup        v27.8h, v17.h[2]               \n"  // V2
+      "dup        v28.8h, v17.h[3]               \n"  // V3
+      "movi       v25.8h, #0x80, lsl #8          \n"  // 128.0 in 16-bit
+                                                      // (0x8000)
      "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
+                                                                 // pixels.
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"
@ -2927,7 +2927,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
      "uaddlp      v18.8h, v3.16b                \n"  // A 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16
+                                                                 // more.
      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
@ -2958,16 +2959,15 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"
-  : "+r"(src_argb),  // %0
+      : "+r"(src_argb),    // %0
-    "+r"(src_argb_1),  // %1
+        "+r"(src_argb_1),  // %1
-    "+r"(dst_u),     // %2
+        "+r"(dst_u),       // %2
-    "+r"(dst_v),     // %3
+        "+r"(dst_v),       // %3
-    "+r"(width)        // %4
+        "+r"(width)        // %4
-  : [c] "r"(c)         // %5
+      : [c] "r"(c)         // %5
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-    "v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+        "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
-    "v27", "v28"
+        "v28");
-  );
 }
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
@ -2988,29 +2988,20 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                         &kArgbJPEGConstants);
 }
-void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+void ABGRToUVRow_NEON(const uint8_t* src_abgr, int src_stride_abgr,
-                      int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) {
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                         &kAbgrI601Constants);
 }
-void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra,
-                      int src_stride_bgra,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) {
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
                         &kBgraI601Constants);
 }
-void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba,
-                      int src_stride_rgba,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) {
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
                         &kRgbaI601Constants);
 }
@ -3329,12 +3320,10 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
-static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
+static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src, int src_stride,
-                                        int src_stride,
+                                             uint8_t* dst_u, uint8_t* dst_v,
-                                        uint8_t* dst_u,
+                                             int width,
-                                        uint8_t* dst_v,
+                                             const struct ArgbConstants* c) {
-                                        int width,
-                                        const struct ArgbConstants* c) {
  const uint8_t* src1 = src + src_stride;
  asm volatile(
      "movi        v23.8h, #0x80, lsl #8           \n"  // 128.0 (0x8000 in
@ -3388,12 +3377,12 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
      "str         d0, [%[dst_u]], #8              \n"  // store 8 pixels U
      "str         d1, [%[dst_v]], #8              \n"  // store 8 pixels V
      "b.gt        1b                              \n"
-      : [src] "+r"(src),                // %[src]
+      : [src] "+r"(src),      // %[src]
-        [src1] "+r"(src1),              // %[src1]
+        [src1] "+r"(src1),    // %[src1]
-        [dst_u] "+r"(dst_u),            // %[dst_u]
+        [dst_u] "+r"(dst_u),  // %[dst_u]
-        [dst_v] "+r"(dst_v),            // %[dst_v]
+        [dst_v] "+r"(dst_v),  // %[dst_v]
-        [width] "+r"(width)             // %[width]
+        [width] "+r"(width)   // %[width]
-      : [c] "r"(c)                      // %[c]
+      : [c] "r"(c)            // %[c]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
        "v24", "v25");
 }
@ -3404,8 +3393,8 @@ void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                 uint8_t* dst_v,
                                 int width,
                                 const struct ArgbConstants* c) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
-                                   c);
+                                   width, c);
 }
 void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
@ -3413,8 +3402,8 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
-                              &kArgbI601Constants);
+                                   width, &kArgbI601Constants);
 }
 void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3422,8 +3411,8 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v,
-                              &kAbgrI601Constants);
+                                   width, &kAbgrI601Constants);
 }
 void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
@ -3431,8 +3420,8 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v,
-                              &kBgraI601Constants);
+                                   width, &kBgraI601Constants);
 }
 void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
@ -3440,8 +3429,8 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v,
-                              &kRgbaI601Constants);
+                                   width, &kRgbaI601Constants);
 }
 void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
@ -3449,8 +3438,8 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
-                              &kArgbJPEGConstants);
+                                   width, &kArgbJPEGConstants);
 }
 void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3458,8 +3447,8 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v,
-                              &kAbgrJPEGConstants);
+                                   width, &kAbgrJPEGConstants);
 }
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@ -3589,15 +3578,14 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
      "addhn       v1.8b, v1.8h, v22.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
-      : "+r"(src_argb),    // %0
+      : "+r"(src_argb),  // %0
-        "+r"(dst_y),       // %1
+        "+r"(dst_y),     // %1
-        "+r"(width)        // %2
+        "+r"(width)      // %2
-      : "r"(c)             // %3
+      : "r"(c)           // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22");
 }
 void ARGBToYMatrixRow_NEON_DotProd(
    const uint8_t* src_argb,
    uint8_t* dst_y,
@ -3625,14 +3613,14 @@ void ARGBToYMatrixRow_NEON_DotProd(
      "addhn       v1.8b, v1.8h, v19.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
-      : "+r"(src_argb),    // %0
+      : "+r"(src_argb),  // %0
-        "+r"(dst_y),       // %1
+        "+r"(dst_y),     // %1
-        "+r"(width)        // %2
+        "+r"(width)      // %2
-      : "r"(c)             // %3
+      : "r"(c)           // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19");
 }
 // RGB to JPeg coefficients
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -3732,10 +3720,10 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
      "addhn       v1.8b, v1.8h, v21.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
-      : "+r"(src_rgb),     // %0
+      : "+r"(src_rgb),  // %0
-        "+r"(dst_y),       // %1
+        "+r"(dst_y),    // %1
-        "+r"(width)        // %2
+        "+r"(width)     // %2
-      : "r"(c)  // %3
+      : "r"(c)          // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
        "v19", "v20", "v21");
 }
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -116,10 +116,8 @@ extern "C" {
 // Convert 32 ARGB pixels (128 bytes) to 32 UV444 values.
 #if defined(HAS_ARGBTOYMATRIXROW_AVX2) || defined(HAS_ARGBTOUV444MATRIXROW_AVX2)
 LIBYUV_TARGET_AVX2
-void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
+void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u,
-                               uint8_t* dst_u,
+                               uint8_t* dst_v, int width,
-                               uint8_t* dst_v,
-                               int width,
                               const struct ArgbConstants* c) {
  __m256i ymm_u =
      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
@ -455,8 +453,8 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
 #ifdef HAS_MIRRORROW_AVX2
 LIBYUV_TARGET_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
-  __m256i ymm_shuf =
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm256_broadcastsi128_si256(_mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+      _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
  src += width;
  while (width > 0) {
    src -= 32;
@ -473,8 +471,8 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 #ifdef HAS_MIRRORUVROW_AVX2
 LIBYUV_TARGET_AVX2
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-  __m256i ymm_shuf =
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-      _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
+      _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
  src_uv += width * 2;
  while (width > 0) {
    src_uv -= 32;
@ -490,12 +488,10 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
 #ifdef HAS_MIRRORSPLITUVROW_AVX2
 LIBYUV_TARGET_AVX2
-void MirrorSplitUVRow_AVX2(const uint8_t* src_uv,
+void MirrorSplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u,
-                           uint8_t* dst_u,
+                           uint8_t* dst_v, int width) {
-                           uint8_t* dst_v,
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-                           int width) {
+      _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
-  __m256i ymm_shuf =
-      _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
  src_uv += width * 2;
  while (width > 0) {
    src_uv -= 32;
@ -513,28 +509,30 @@ void MirrorSplitUVRow_AVX2(const uint8_t* src_uv,
 #ifdef HAS_RGB24MIRRORROW_AVX2
 LIBYUV_TARGET_AVX2
-void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
+void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_rgb24,
-                         uint8_t* dst_rgb24,
                         int width) {
-  __m256i shuf0 = _mm256_setr_epi8(
+  __m256i shuf0 =
-      -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2,
+      _mm256_setr_epi8(-1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2, -1,
-      -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2);
+                       12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2);
-  __m128i shuf1 = _mm_setr_epi8(
+  __m128i shuf1 =
-      13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1);
+      _mm_setr_epi8(13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1);
  src_rgb24 += width * 3 - 96;
  while (width > 0) {
    __m128i v0_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 0));
    __m128i v0_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 15));
-    __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1);
+    __m256i v0 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1);
    __m128i v1_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 30));
    __m128i v1_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 45));
-    __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1);
+    __m256i v1 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1);
    __m128i v2_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 60));
    __m128i v2_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 75));
-    __m256i v2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1);
+    __m256i v2 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1);
    __m128i v3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 80));
@ -544,11 +542,14 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
    v3 = _mm_shuffle_epi8(v3, shuf1);
    _mm_storeu_si128((__m128i*)(dst_rgb24 + 80), _mm256_castsi256_si128(v0));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 65), _mm256_extracti128_si256(v0, 1));
+    _mm_storeu_si128((__m128i*)(dst_rgb24 + 65),
                     _mm256_extracti128_si256(v0, 1));
    _mm_storeu_si128((__m128i*)(dst_rgb24 + 50), _mm256_castsi256_si128(v1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 35), _mm256_extracti128_si256(v1, 1));
+    _mm_storeu_si128((__m128i*)(dst_rgb24 + 35),
                     _mm256_extracti128_si256(v1, 1));
    _mm_storeu_si128((__m128i*)(dst_rgb24 + 20), _mm256_castsi256_si128(v2));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 5), _mm256_extracti128_si256(v2, 1));
+    _mm_storeu_si128((__m128i*)(dst_rgb24 + 5),
                     _mm256_extracti128_si256(v2, 1));
    _mm_storel_epi64((__m128i*)(dst_rgb24 + 0), v3);
    src_rgb24 -= 96;
@ -560,10 +561,8 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
 #ifdef HAS_INTERPOLATEROW_AVX2
 LIBYUV_TARGET_AVX2
-void InterpolateRow_AVX2(uint8_t* dst_ptr,
+void InterpolateRow_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr,
-                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int width,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  int y1 = source_y_fraction;
  int y0 = 256 - y1;
@ -607,10 +606,8 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
 #ifdef HAS_INTERPOLATEROW_16_AVX2
 LIBYUV_TARGET_AVX2
-void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
+void InterpolateRow_16_AVX2(uint16_t* dst_ptr, const uint16_t* src_ptr,
-                            const uint16_t* src_ptr,
+                            ptrdiff_t src_stride, int width,
                            ptrdiff_t src_stride,
                            int width,
                            int source_y_fraction) {
  int y1 = source_y_fraction;
  int y0 = 256 - y1;
@ -629,7 +626,8 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
    for (i = 0; i < width; i += 16) {
      __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
      __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i), _mm256_avg_epu16(row0, row1));
+      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
                          _mm256_avg_epu16(row0, row1));
    }
  } else {
    for (i = 0; i < width; i += 16) {
@ -672,21 +670,23 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 #ifdef HAS_J400TOARGBROW_AVX2
 alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = {
    0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
-    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u
+    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u};
 };
 alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = {
-    8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
+    8u,   8u,   8u,  128u, 9u,   9u,   9u,  128u, 10u,  10u, 10u,
-    12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
+    128u, 11u,  11u, 11u,  128u, 12u,  12u, 12u,  128u, 13u, 13u,
-};
+    13u,  128u, 14u, 14u,  14u,  128u, 15u, 15u,  15u,  128u};
 LIBYUV_TARGET_AVX2
 void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-  __m256i ymm_mask0 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
+  __m256i ymm_mask0 =
-  __m256i ymm_mask1 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
+      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
  __m256i ymm_mask1 =
      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
  __m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u);
  while (width > 0) {
-    __m256i ymm0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
+    __m256i ymm0 =
        _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
    __m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0);
    __m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1);
@ -707,13 +707,14 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 #ifdef HAS_RGB24TOARGBROW_AVX2
 alignas(16) static const uint8_t kShuffleMaskRGB24ToARGB[2][16] = {
    {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u},
-    {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u, 128u}
+    {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u,
-};
+     128u}};
 #endif
 #ifdef HAS_RGB565TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) {
+void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb,
                          int width) {
  __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
  __m256i ymm_scale_g = _mm256_set1_epi32(0x20802080);
  __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
@ -730,11 +731,11 @@ void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int widt
    ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
    ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
    ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2); // RB
+    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
    ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
    ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
-    ymm0 = _mm256_or_si256(ymm0, ymm_mask_a); // GA
+    ymm0 = _mm256_or_si256(ymm0, ymm_mask_a);  // GA
    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
@ -755,7 +756,8 @@ void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int widt
 #ifdef HAS_ARGB1555TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) {
+void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb,
                            int width) {
  __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
  __m256i ymm_scale_g = _mm256_set1_epi32(0x42004200);
  __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
@ -773,14 +775,14 @@ void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int
    ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
    ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
    ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2); // RB
+    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
    ymm2 = ymm0;
    ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
    ymm2 = _mm256_srai_epi16(ymm2, 8);
    ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
    ymm2 = _mm256_and_si256(ymm2, ymm_mask_a);
-    ymm0 = _mm256_or_si256(ymm0, ymm2); // GA
+    ymm0 = _mm256_or_si256(ymm0, ymm2);  // GA
    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
@ -801,7 +803,8 @@ void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int
 #ifdef HAS_ARGB4444TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) {
+void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb,
                            int width) {
  __m256i ymm_mask = _mm256_set1_epi32(0x0f0f0f0f);
  __m256i ymm_mask2 = _mm256_slli_epi32(ymm_mask, 4);
@ -841,27 +844,34 @@ void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int
 #ifdef HAS_RGB24TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb,
                         int width) {
  __m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0]));
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
-  __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1]));
+      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0]));
  __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(
      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1]));
  while (width > 0) {
    __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_rgb24);
    __m256i ymm0 = _mm256_castsi128_si256(xmm0);
-    ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1);
+    ymm0 = _mm256_inserti128_si256(
        ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1);
    __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 24));
    __m256i ymm1 = _mm256_castsi128_si256(xmm1);
-    ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1);
+    ymm1 = _mm256_inserti128_si256(
        ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1);
    __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 48));
    __m256i ymm2 = _mm256_castsi128_si256(xmm2);
-    ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1);
+    ymm2 = _mm256_inserti128_si256(
        ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1);
    __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 68));
    __m256i ymm3 = _mm256_castsi128_si256(xmm3);
-    ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1);
+    ymm3 = _mm256_inserti128_si256(
        ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1);
    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@ -886,6 +896,46 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width)
 }
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
 // Permutes the bytes of a row of 4-byte pixels using a 16-byte shuffle table.
 //
 // src_argb: source row; 64 bytes are read per 16 pixels of width.
 // dst_argb: destination row; 64 bytes are written per 16 pixels of width.
 // shuffler: 16 byte indices, loaded unaligned and applied to every 16-byte
 //           group of the source.
 // width:    pixel count. Only whole groups of 16 pixels are processed; any
 //           remainder below 16 is left untouched by this function.
 LIBYUV_TARGET_AVX2
 void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb,
                         const uint8_t* shuffler, int width) {
  // Copy the 16-byte table into both 128-bit lanes of a 256-bit register.
  const __m128i table = _mm_loadu_si128((const __m128i*)shuffler);
  const __m256i lanes = _mm256_broadcastsi128_si256(table);
  for (; width >= 16; width -= 16, src_argb += 64, dst_argb += 64) {
    // Both halves are loaded before anything is stored.
    const __m256i in_lo = _mm256_loadu_si256((const __m256i*)src_argb);
    const __m256i in_hi = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
    const __m256i out_lo = _mm256_shuffle_epi8(in_lo, lanes);
    const __m256i out_hi = _mm256_shuffle_epi8(in_hi, lanes);
    _mm256_storeu_si256((__m256i*)dst_argb, out_lo);
    _mm256_storeu_si256((__m256i*)(dst_argb + 32), out_hi);
  }
 }
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_AVX512BW
 // Permutes the bytes of a row of 4-byte pixels using a 16-byte shuffle table,
 // working on 512-bit registers.
 //
 // src_argb: source row; 128 bytes are read per 32 pixels of width.
 // dst_argb: destination row; 128 bytes are written per 32 pixels of width.
 // shuffler: 16 byte indices, loaded unaligned and applied to every 16-byte
 //           group of the source.
 // width:    pixel count. Only whole groups of 32 pixels are processed; any
 //           remainder below 32 is left untouched by this function.
 LIBYUV_TARGET_AVX512BW
 void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_argb,
                             const uint8_t* shuffler, int width) {
  // Copy the 16-byte table into all four 128-bit lanes of a 512-bit register.
  const __m128i table = _mm_loadu_si128((const __m128i*)shuffler);
  const __m512i lanes = _mm512_broadcast_i32x4(table);
  for (; width >= 32; width -= 32, src_argb += 128, dst_argb += 128) {
    // Both halves are loaded before anything is stored.
    const __m512i in_lo = _mm512_loadu_si512((const __m512i*)src_argb);
    const __m512i in_hi = _mm512_loadu_si512((const __m512i*)(src_argb + 64));
    const __m512i out_lo = _mm512_shuffle_epi8(in_lo, lanes);
    const __m512i out_hi = _mm512_shuffle_epi8(in_hi, lanes);
    _mm512_storeu_si512((__m512i*)dst_argb, out_lo);
    _mm512_storeu_si512((__m512i*)(dst_argb + 64), out_hi);
  }
 }
 #endif
 #endif
 #ifdef __cplusplus
--- a/source/scale.cc
+++ b/source/scale.cc
@ -36,15 +36,10 @@ static __inline int Abs(int v) {
 // This is an optimized version for scaling down a plane to 1/2 of
 // its original size.
-static void ScalePlaneDown2(int src_width,
+static void ScalePlaneDown2(int src_width, int src_height, int dst_width,
-                            int src_height,
+                            int dst_height, ptrdiff_t src_stride,
-                            int dst_width,
+                            ptrdiff_t dst_stride, const uint8_t* src_ptr,
-                            int dst_height,
+                            uint8_t* dst_ptr, enum FilterMode filtering) {
                            ptrdiff_t src_stride,
                            ptrdiff_t dst_stride,
                            const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                        uint8_t* dst_ptr, int dst_width) =
@ -148,15 +143,10 @@ static void ScalePlaneDown2(int src_width,
  }
 }
-static void ScalePlaneDown2_16(int src_width,
+static void ScalePlaneDown2_16(int src_width, int src_height, int dst_width,
-                               int src_height,
+                               int dst_height, ptrdiff_t src_stride,
-                               int dst_width,
+                               ptrdiff_t dst_stride, const uint16_t* src_ptr,
-                               int dst_height,
+                               uint16_t* dst_ptr, enum FilterMode filtering) {
                               ptrdiff_t src_stride,
                               ptrdiff_t dst_stride,
                               const uint16_t* src_ptr,
                               uint16_t* dst_ptr,
                               enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                        uint16_t* dst_ptr, int dst_width) =
@ -256,15 +246,10 @@ void ScalePlaneDown2_16To8(int src_width,
 // This is an optimized version for scaling down a plane to 1/4 of
 // its original size.
-static void ScalePlaneDown4(int src_width,
+static void ScalePlaneDown4(int src_width, int src_height, int dst_width,
-                            int src_height,
+                            int dst_height, ptrdiff_t src_stride,
-                            int dst_width,
+                            ptrdiff_t dst_stride, const uint8_t* src_ptr,
-                            int dst_height,
+                            uint8_t* dst_ptr, enum FilterMode filtering) {
                            ptrdiff_t src_stride,
                            ptrdiff_t dst_stride,
                            const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                        uint8_t* dst_ptr, int dst_width) =
@ -328,15 +313,10 @@ static void ScalePlaneDown4(int src_width,
  }
 }
-static void ScalePlaneDown4_16(int src_width,
+static void ScalePlaneDown4_16(int src_width, int src_height, int dst_width,
-                               int src_height,
+                               int dst_height, ptrdiff_t src_stride,
-                               int dst_width,
+                               ptrdiff_t dst_stride, const uint16_t* src_ptr,
-                               int dst_height,
+                               uint16_t* dst_ptr, enum FilterMode filtering) {
                               ptrdiff_t src_stride,
                               ptrdiff_t dst_stride,
                               const uint16_t* src_ptr,
                               uint16_t* dst_ptr,
                               enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                        uint16_t* dst_ptr, int dst_width) =
@ -372,15 +352,10 @@ static void ScalePlaneDown4_16(int src_width,
 }
 // Scale plane down, 3/4
-static void ScalePlaneDown34(int src_width,
+static void ScalePlaneDown34(int src_width, int src_height, int dst_width,
-                             int src_height,
+                             int dst_height, ptrdiff_t src_stride,
-                             int dst_width,
+                             ptrdiff_t dst_stride, const uint8_t* src_ptr,
-                             int dst_height,
+                             uint8_t* dst_ptr, enum FilterMode filtering) {
                             ptrdiff_t src_stride,
                             ptrdiff_t dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                           uint8_t* dst_ptr, int dst_width);
@ -499,15 +474,10 @@ static void ScalePlaneDown34(int src_width,
  }
 }
-static void ScalePlaneDown34_16(int src_width,
+static void ScalePlaneDown34_16(int src_width, int src_height, int dst_width,
-                                int src_height,
+                                int dst_height, ptrdiff_t src_stride,
-                                int dst_width,
+                                ptrdiff_t dst_stride, const uint16_t* src_ptr,
-                                int dst_height,
+                                uint16_t* dst_ptr, enum FilterMode filtering) {
                                ptrdiff_t src_stride,
                                ptrdiff_t dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                           uint16_t* dst_ptr, int dst_width);
@ -585,15 +555,10 @@ static void ScalePlaneDown34_16(int src_width,
 // ggghhhii
 // Boxes are 3x3, 2x3, 3x2 and 2x2
-static void ScalePlaneDown38(int src_width,
+static void ScalePlaneDown38(int src_width, int src_height, int dst_width,
-                             int src_height,
+                             int dst_height, ptrdiff_t src_stride,
-                             int dst_width,
+                             ptrdiff_t dst_stride, const uint8_t* src_ptr,
-                             int dst_height,
+                             uint8_t* dst_ptr, enum FilterMode filtering) {
                             ptrdiff_t src_stride,
                             ptrdiff_t dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                           uint8_t* dst_ptr, int dst_width);
@ -705,15 +670,10 @@ static void ScalePlaneDown38(int src_width,
  }
 }
-static void ScalePlaneDown38_16(int src_width,
+static void ScalePlaneDown38_16(int src_width, int src_height, int dst_width,
-                                int src_height,
+                                int dst_height, ptrdiff_t src_stride,
-                                int dst_width,
+                                ptrdiff_t dst_stride, const uint16_t* src_ptr,
-                                int dst_height,
+                                uint16_t* dst_ptr, enum FilterMode filtering) {
                                ptrdiff_t src_stride,
                                ptrdiff_t dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr,
                                enum FilterMode filtering) {
  int y;
  void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                           uint16_t* dst_ptr, int dst_width);
@ -898,13 +858,9 @@ static void ScaleAddCols1_16_C(int dst_width,
 // one pixel of destination using fixed point (16.16) to step
 // through source, sampling a box of pixels with simple
 // averaging.
-static int ScalePlaneBox(int src_width,
+static int ScalePlaneBox(int src_width, int src_height, int dst_width,
-                         int src_height,
+                         int dst_height, ptrdiff_t src_stride,
-                         int dst_width,
+                         ptrdiff_t dst_stride, const uint8_t* src_ptr,
                         int dst_height,
                         ptrdiff_t src_stride,
                         ptrdiff_t dst_stride,
                         const uint8_t* src_ptr,
                         uint8_t* dst_ptr) {
  int j, k;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
@ -987,13 +943,9 @@ static int ScalePlaneBox(int src_width,
  return 0;
 }
-static int ScalePlaneBox_16(int src_width,
+static int ScalePlaneBox_16(int src_width, int src_height, int dst_width,
-                            int src_height,
+                            int dst_height, ptrdiff_t src_stride,
-                            int dst_width,
+                            ptrdiff_t dst_stride, const uint16_t* src_ptr,
                            int dst_height,
                            ptrdiff_t src_stride,
                            ptrdiff_t dst_stride,
                            const uint16_t* src_ptr,
                            uint16_t* dst_ptr) {
  int j, k;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
@ -1045,15 +997,10 @@ static int ScalePlaneBox_16(int src_width,
 }
 // Scale plane down with bilinear interpolation.
-static int ScalePlaneBilinearDown(int src_width,
+static int ScalePlaneBilinearDown(int src_width, int src_height, int dst_width,
-                                  int src_height,
+                                  int dst_height, ptrdiff_t src_stride,
-                                  int dst_width,
+                                  ptrdiff_t dst_stride, const uint8_t* src_ptr,
-                                  int dst_height,
+                                  uint8_t* dst_ptr, enum FilterMode filtering) {
                                  ptrdiff_t src_stride,
                                  ptrdiff_t dst_stride,
                                  const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  enum FilterMode filtering) {
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
  int y = 0;
@ -1157,14 +1104,10 @@ static int ScalePlaneBilinearDown(int src_width,
  return 0;
 }
-static int ScalePlaneBilinearDown_16(int src_width,
+static int ScalePlaneBilinearDown_16(int src_width, int src_height,
-                                     int src_height,
+                                     int dst_width, int dst_height,
-                                     int dst_width,
+                                     ptrdiff_t src_stride, ptrdiff_t dst_stride,
-                                     int dst_height,
+                                     const uint16_t* src_ptr, uint16_t* dst_ptr,
                                     ptrdiff_t src_stride,
                                     ptrdiff_t dst_stride,
                                     const uint16_t* src_ptr,
                                     uint16_t* dst_ptr,
                                     enum FilterMode filtering) {
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
@ -1249,15 +1192,10 @@ static int ScalePlaneBilinearDown_16(int src_width,
 }
 // Scale plane up with bilinear interpolation.
-static int ScalePlaneBilinearUp(int src_width,
+static int ScalePlaneBilinearUp(int src_width, int src_height, int dst_width,
-                                int src_height,
+                                int dst_height, ptrdiff_t src_stride,
-                                int dst_width,
+                                ptrdiff_t dst_stride, const uint8_t* src_ptr,
-                                int dst_height,
+                                uint8_t* dst_ptr, enum FilterMode filtering) {
                                ptrdiff_t src_stride,
                                ptrdiff_t dst_stride,
                                const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                enum FilterMode filtering) {
  int j;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
@ -1398,13 +1336,9 @@ static int ScalePlaneBilinearUp(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original width, using linear interpolation.
 // This is used to scale U and V planes of I422 to I444.
-static void ScalePlaneUp2_Linear(int src_width,
+static void ScalePlaneUp2_Linear(int src_width, int src_height, int dst_width,
-                                 int src_height,
+                                 int dst_height, ptrdiff_t src_stride,
-                                 int dst_width,
+                                 ptrdiff_t dst_stride, const uint8_t* src_ptr,
                                 int dst_height,
                                 ptrdiff_t src_stride,
                                 ptrdiff_t dst_stride,
                                 const uint8_t* src_ptr,
                                 uint8_t* dst_ptr) {
  void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
      ScaleRowUp2_Linear_Any_C;
@ -1463,13 +1397,9 @@ static void ScalePlaneUp2_Linear(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original size, using bilinear interpolation.
 // This is used to scale U and V planes of I420 to I444.
-static void ScalePlaneUp2_Bilinear(int src_width,
+static void ScalePlaneUp2_Bilinear(int src_width, int src_height, int dst_width,
-                                   int src_height,
+                                   int dst_height, ptrdiff_t src_stride,
-                                   int dst_width,
+                                   ptrdiff_t dst_stride, const uint8_t* src_ptr,
                                   int dst_height,
                                   ptrdiff_t src_stride,
                                   ptrdiff_t dst_stride,
                                   const uint8_t* src_ptr,
                                   uint8_t* dst_ptr) {
  void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                      uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
@ -1529,12 +1459,9 @@ static void ScalePlaneUp2_Bilinear(int src_width,
 // its original width, using linear interpolation.
 // stride is in count of uint16_t.
 // This is used to scale U and V planes of I210 to I410 and I212 to I412.
-static void ScalePlaneUp2_12_Linear(int src_width,
+static void ScalePlaneUp2_12_Linear(int src_width, int src_height,
-                                    int src_height,
+                                    int dst_width, int dst_height,
-                                    int dst_width,
+                                    ptrdiff_t src_stride, ptrdiff_t dst_stride,
                                    int dst_height,
                                    ptrdiff_t src_stride,
                                    ptrdiff_t dst_stride,
                                    const uint16_t* src_ptr,
                                    uint16_t* dst_ptr) {
  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
@ -1584,10 +1511,8 @@ static void ScalePlaneUp2_12_Linear(int src_width,
 // its original size, using bilinear interpolation.
 // stride is in count of uint16_t.
 // This is used to scale U and V planes of I010 to I410 and I012 to I412.
-static void ScalePlaneUp2_12_Bilinear(int src_width,
+static void ScalePlaneUp2_12_Bilinear(int src_width, int src_height,
-                                      int src_height,
+                                      int dst_width, int dst_height,
                                      int dst_width,
                                      int dst_height,
                                      ptrdiff_t src_stride,
                                      ptrdiff_t dst_stride,
                                      const uint16_t* src_ptr,
@ -1632,12 +1557,9 @@ static void ScalePlaneUp2_12_Bilinear(int src_width,
  }
 }
-static void ScalePlaneUp2_16_Linear(int src_width,
+static void ScalePlaneUp2_16_Linear(int src_width, int src_height,
-                                    int src_height,
+                                    int dst_width, int dst_height,
-                                    int dst_width,
+                                    ptrdiff_t src_stride, ptrdiff_t dst_stride,
                                    int dst_height,
                                    ptrdiff_t src_stride,
                                    ptrdiff_t dst_stride,
                                    const uint16_t* src_ptr,
                                    uint16_t* dst_ptr) {
  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
@ -1682,10 +1604,8 @@ static void ScalePlaneUp2_16_Linear(int src_width,
  }
 }
-static void ScalePlaneUp2_16_Bilinear(int src_width,
+static void ScalePlaneUp2_16_Bilinear(int src_width, int src_height,
-                                      int src_height,
+                                      int dst_width, int dst_height,
                                      int dst_width,
                                      int dst_height,
                                      ptrdiff_t src_stride,
                                      ptrdiff_t dst_stride,
                                      const uint16_t* src_ptr,
@ -1730,14 +1650,10 @@ static void ScalePlaneUp2_16_Bilinear(int src_width,
  }
 }
-static int ScalePlaneBilinearUp_16(int src_width,
+static int ScalePlaneBilinearUp_16(int src_width, int src_height, int dst_width,
-                                   int src_height,
+                                   int dst_height, ptrdiff_t src_stride,
                                   int dst_width,
                                   int dst_height,
                                   ptrdiff_t src_stride,
                                   ptrdiff_t dst_stride,
-                                   const uint16_t* src_ptr,
+                                   const uint16_t* src_ptr, uint16_t* dst_ptr,
                                   uint16_t* dst_ptr,
                                   enum FilterMode filtering) {
  int j;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
@ -1864,13 +1780,9 @@ static int ScalePlaneBilinearUp_16(int src_width,
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
-static void ScalePlaneSimple(int src_width,
+static void ScalePlaneSimple(int src_width, int src_height, int dst_width,
-                             int src_height,
+                             int dst_height, ptrdiff_t src_stride,
-                             int dst_width,
+                             ptrdiff_t dst_stride, const uint8_t* src_ptr,
                             int dst_height,
                             ptrdiff_t src_stride,
                             ptrdiff_t dst_stride,
                             const uint8_t* src_ptr,
                             uint8_t* dst_ptr) {
  int i;
  void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width,
@ -1900,13 +1812,9 @@ static void ScalePlaneSimple(int src_width,
  }
 }
-static void ScalePlaneSimple_16(int src_width,
+static void ScalePlaneSimple_16(int src_width, int src_height, int dst_width,
-                                int src_height,
+                                int dst_height, ptrdiff_t src_stride,
-                                int dst_width,
+                                ptrdiff_t dst_stride, const uint16_t* src_ptr,
                                int dst_height,
                                ptrdiff_t src_stride,
                                ptrdiff_t dst_stride,
                                const uint16_t* src_ptr,
                                uint16_t* dst_ptr) {
  int i;
  void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width,
@ -1951,9 +1859,9 @@ int ScalePlane(const uint8_t* src,
  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
  // This prevents FixedDiv signed integer overflows that can lead to division
  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 ||
+  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_width > 32768 || src_height < -32768 || src_height > 32768 ||
+      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0) {
+      dst_height <= 0) {
    return -1;
  }
  // Simplify filtering when possible.
@ -2059,9 +1967,9 @@ int ScalePlane_16(const uint16_t* src,
  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
  // This prevents FixedDiv signed integer overflows that can lead to division
  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 ||
+  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_width > 32768 || src_height < -32768 || src_height > 32768 ||
+      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0) {
+      dst_height <= 0) {
    return -1;
  }
  // Simplify filtering when possible.
@ -2171,9 +2079,9 @@ int ScalePlane_12(const uint16_t* src,
  // Reject dimensions larger than 32768 (or smaller than -32768 for height).
  // This prevents FixedDiv signed integer overflows that can lead to division
  // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 ||
+  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
-      src_width > 32768 || src_height < -32768 || src_height > 32768 ||
+      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0) {
+      dst_height <= 0) {
    return -1;
  }
  // Simplify filtering when possible.
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@ -34,18 +34,10 @@ static __inline int Abs(int v) {
 // ScaleARGB ARGB, 1/2
 // This is an optimized version for scaling down a ARGB to 1/2 of
 // its original size.
-static void ScaleARGBDown2(int src_width,
+static void ScaleARGBDown2(int src_width, int src_height, int dst_width,
-                           int src_height,
+                           int dst_height, ptrdiff_t src_stride,
-                           int dst_width,
+                           ptrdiff_t dst_stride, const uint8_t* src_argb,
-                           int dst_height,
+                           uint8_t* dst_argb, int x, int dx, int y, int dy,
                           ptrdiff_t src_stride,
                           ptrdiff_t dst_stride,
                           const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int x,
                           int dx,
                           int y,
                           int dy,
                           enum FilterMode filtering) {
  int j;
  ptrdiff_t row_stride = src_stride * (dy >> 16);
@ -148,18 +140,10 @@ static void ScaleARGBDown2(int src_width,
 // ScaleARGB ARGB, 1/4
 // This is an optimized version for scaling down a ARGB to 1/4 of
 // its original size.
-static int ScaleARGBDown4Box(int src_width,
+static int ScaleARGBDown4Box(int src_width, int src_height, int dst_width,
-                             int src_height,
+                             int dst_height, ptrdiff_t src_stride,
-                             int dst_width,
+                             ptrdiff_t dst_stride, const uint8_t* src_argb,
-                             int dst_height,
+                             uint8_t* dst_argb, int x, int dx, int y, int dy) {
                             ptrdiff_t src_stride,
                             ptrdiff_t dst_stride,
                             const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             int x,
                             int dx,
                             int y,
                             int dy) {
  int j;
  // Allocate 2 rows of ARGB.
  const int row_size = (dst_width * 2 * 4 + 31) & ~31;
@ -222,18 +206,10 @@ static int ScaleARGBDown4Box(int src_width,
 // ScaleARGB ARGB Even
 // This is an optimized version for scaling down a ARGB to even
 // multiple of its original size.
-static void ScaleARGBDownEven(int src_width,
+static void ScaleARGBDownEven(int src_width, int src_height, int dst_width,
-                              int src_height,
+                              int dst_height, ptrdiff_t src_stride,
-                              int dst_width,
+                              ptrdiff_t dst_stride, const uint8_t* src_argb,
-                              int dst_height,
+                              uint8_t* dst_argb, int x, int dx, int y, int dy,
                              ptrdiff_t src_stride,
                              ptrdiff_t dst_stride,
                              const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              int x,
                              int dx,
                              int y,
                              int dy,
                              enum FilterMode filtering) {
  int j;
  int col_step = dx >> 16;
@ -298,19 +274,11 @@ static void ScaleARGBDownEven(int src_width,
 }
 // Scale ARGB down with bilinear interpolation.
-static int ScaleARGBBilinearDown(int src_width,
+static int ScaleARGBBilinearDown(int src_width, int src_height, int dst_width,
-                                 int src_height,
+                                 int dst_height, ptrdiff_t src_stride,
-                                 int dst_width,
+                                 ptrdiff_t dst_stride, const uint8_t* src_argb,
-                                 int dst_height,
+                                 uint8_t* dst_argb, int x, int dx, int y,
-                                 ptrdiff_t src_stride,
+                                 int dy, enum FilterMode filtering) {
                                 ptrdiff_t dst_stride,
                                 const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int x,
                                 int dx,
                                 int y,
                                 int dy,
                                 enum FilterMode filtering) {
  int j;
  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
                         ptrdiff_t src_stride, int dst_width,
@ -425,18 +393,10 @@ static int ScaleARGBBilinearDown(int src_width,
 }
 // Scale ARGB up with bilinear interpolation.
-static int ScaleARGBBilinearUp(int src_width,
+static int ScaleARGBBilinearUp(int src_width, int src_height, int dst_width,
-                               int src_height,
+                               int dst_height, ptrdiff_t src_stride,
-                               int dst_width,
+                               ptrdiff_t dst_stride, const uint8_t* src_argb,
-                               int dst_height,
+                               uint8_t* dst_argb, int x, int dx, int y, int dy,
                               ptrdiff_t src_stride,
                               ptrdiff_t dst_stride,
                               const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int x,
                               int dx,
                               int y,
                               int dy,
                               enum FilterMode filtering) {
  int j;
  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
@ -604,18 +564,10 @@ static int ScaleARGBBilinearUp(int src_width,
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
-static void ScaleARGBSimple(int src_width,
+static void ScaleARGBSimple(int src_width, int src_height, int dst_width,
-                            int src_height,
+                            int dst_height, ptrdiff_t src_stride,
-                            int dst_width,
+                            ptrdiff_t dst_stride, const uint8_t* src_argb,
-                            int dst_height,
+                            uint8_t* dst_argb, int x, int dx, int y, int dy) {
                            ptrdiff_t src_stride,
                            ptrdiff_t dst_stride,
                            const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int x,
                            int dx,
                            int y,
                            int dy) {
  int j;
  void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb,
                        int dst_width, int x, int dx) =
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@ -56,18 +56,10 @@ static __inline int Abs(int v) {
 // This is an optimized version for scaling down a UV to 1/2 of
 // its original size.
 #if HAS_SCALEUVDOWN2
-static void ScaleUVDown2(int src_width,
+static void ScaleUVDown2(int src_width, int src_height, int dst_width,
-                         int src_height,
+                         int dst_height, ptrdiff_t src_stride,
-                         int dst_width,
+                         ptrdiff_t dst_stride, const uint8_t* src_uv,
-                         int dst_height,
+                         uint8_t* dst_uv, int x, int dx, int y, int dy,
                         ptrdiff_t src_stride,
                         ptrdiff_t dst_stride,
                         const uint8_t* src_uv,
                         uint8_t* dst_uv,
                         int x,
                         int dx,
                         int y,
                         int dy,
                         enum FilterMode filtering) {
  int j;
  ptrdiff_t row_stride = src_stride * (dy >> 16);
@ -171,18 +163,10 @@ static void ScaleUVDown2(int src_width,
 // This is an optimized version for scaling down a UV to 1/4 of
 // its original size.
 #if HAS_SCALEUVDOWN4BOX
-static int ScaleUVDown4Box(int src_width,
+static int ScaleUVDown4Box(int src_width, int src_height, int dst_width,
-                           int src_height,
+                           int dst_height, ptrdiff_t src_stride,
-                           int dst_width,
+                           ptrdiff_t dst_stride, const uint8_t* src_uv,
-                           int dst_height,
+                           uint8_t* dst_uv, int x, int dx, int y, int dy) {
                           ptrdiff_t src_stride,
                           ptrdiff_t dst_stride,
                           const uint8_t* src_uv,
                           uint8_t* dst_uv,
                           int x,
                           int dx,
                           int y,
                           int dy) {
  int j;
  // Allocate 2 rows of UV.
  const int row_size = (dst_width * 2 * 2 + 15) & ~15;
@ -253,18 +237,10 @@ static int ScaleUVDown4Box(int src_width,
 // This is an optimized version for scaling down a UV to even
 // multiple of its original size.
 #if HAS_SCALEUVDOWNEVEN
-static void ScaleUVDownEven(int src_width,
+static void ScaleUVDownEven(int src_width, int src_height, int dst_width,
-                            int src_height,
+                            int dst_height, ptrdiff_t src_stride,
-                            int dst_width,
+                            ptrdiff_t dst_stride, const uint8_t* src_uv,
-                            int dst_height,
+                            uint8_t* dst_uv, int x, int dx, int y, int dy,
                            ptrdiff_t src_stride,
                            ptrdiff_t dst_stride,
                            const uint8_t* src_uv,
                            uint8_t* dst_uv,
                            int x,
                            int dx,
                            int y,
                            int dy,
                            enum FilterMode filtering) {
  int j;
  int col_step = dx >> 16;
@ -331,18 +307,10 @@ static void ScaleUVDownEven(int src_width,
 // Scale UV down with bilinear interpolation.
 #if HAS_SCALEUVBILINEARDOWN
-static int ScaleUVBilinearDown(int src_width,
+static int ScaleUVBilinearDown(int src_width, int src_height, int dst_width,
-                               int src_height,
+                               int dst_height, ptrdiff_t src_stride,
-                               int dst_width,
+                               ptrdiff_t dst_stride, const uint8_t* src_uv,
-                               int dst_height,
+                               uint8_t* dst_uv, int x, int dx, int y, int dy,
                               ptrdiff_t src_stride,
                               ptrdiff_t dst_stride,
                               const uint8_t* src_uv,
                               uint8_t* dst_uv,
                               int x,
                               int dx,
                               int y,
                               int dy,
                               enum FilterMode filtering) {
  int j;
  void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv,
@ -445,18 +413,10 @@ static int ScaleUVBilinearDown(int src_width,
 // Scale UV up with bilinear interpolation.
 #if HAS_SCALEUVBILINEARUP
-static int ScaleUVBilinearUp(int src_width,
+static int ScaleUVBilinearUp(int src_width, int src_height, int dst_width,
-                             int src_height,
+                             int dst_height, ptrdiff_t src_stride,
-                             int dst_width,
+                             ptrdiff_t dst_stride, const uint8_t* src_uv,
-                             int dst_height,
+                             uint8_t* dst_uv, int x, int dx, int y, int dy,
                             ptrdiff_t src_stride,
                             ptrdiff_t dst_stride,
                             const uint8_t* src_uv,
                             uint8_t* dst_uv,
                             int x,
                             int dx,
                             int y,
                             int dy,
                             enum FilterMode filtering) {
  int j;
  void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv,
@ -603,13 +563,9 @@ static int ScaleUVBilinearUp(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original width, using linear interpolation.
 // This is used to scale U and V planes of NV16 to NV24.
-static void ScaleUVLinearUp2(int src_width,
+static void ScaleUVLinearUp2(int src_width, int src_height, int dst_width,
-                             int src_height,
+                             int dst_height, ptrdiff_t src_stride,
-                             int dst_width,
+                             ptrdiff_t dst_stride, const uint8_t* src_uv,
                             int dst_height,
                             ptrdiff_t src_stride,
                             ptrdiff_t dst_stride,
                             const uint8_t* src_uv,
                             uint8_t* dst_uv) {
  void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
      ScaleUVRowUp2_Linear_Any_C;
@ -723,13 +679,9 @@ static void ScaleUVBilinearUp2(int src_width,
 // This is an optimized version for scaling up a plane to 2 times of
 // its original width, using linear interpolation.
 // This is used to scale U and V planes of P210 to P410.
-static void ScaleUVLinearUp2_16(int src_width,
+static void ScaleUVLinearUp2_16(int src_width, int src_height, int dst_width,
-                                int src_height,
+                                int dst_height, ptrdiff_t src_stride,
-                                int dst_width,
+                                ptrdiff_t dst_stride, const uint16_t* src_uv,
                                int dst_height,
                                ptrdiff_t src_stride,
                                ptrdiff_t dst_stride,
                                const uint16_t* src_uv,
                                uint16_t* dst_uv) {
  void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
      ScaleUVRowUp2_Linear_16_Any_C;
@ -831,18 +783,10 @@ static void ScaleUVBilinearUp2_16(int src_width,
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
-static void ScaleUVSimple(int src_width,
+static void ScaleUVSimple(int src_width, int src_height, int dst_width,
-                          int src_height,
+                          int dst_height, ptrdiff_t src_stride,
-                          int dst_width,
+                          ptrdiff_t dst_stride, const uint8_t* src_uv,
-                          int dst_height,
+                          uint8_t* dst_uv, int x, int dx, int y, int dy) {
                          ptrdiff_t src_stride,
                          ptrdiff_t dst_stride,
                          const uint8_t* src_uv,
                          uint8_t* dst_uv,
                          int x,
                          int dx,
                          int y,
                          int dy) {
  int j;
  void (*ScaleUVCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width,
                      int x, int dx) =
--- a/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@ -464,8 +464,7 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
 static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
  double y1 = (y - 16) * 1.164384;
  *r = RoundToByte(y1 - (v - 128) * -1.67867);
-  *g = RoundToByte(y1 - (u - 128) * 0.187326 -
+  *g = RoundToByte(y1 - (u - 128) * 0.187326 - (v - 128) * 0.65042);
                   (v - 128) * 0.65042);
  *b = RoundToByte(y1 - (u - 128) * -2.14177);
 }
--- a/unit_test/convert_argb_test.cc
+++ b/unit_test/convert_argb_test.cc
@ -82,15 +82,19 @@ namespace libyuv {
        (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1);                    \
    const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X);    \
    const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y);  \
-    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF);  \
+    align_buffer_page_end(src_y,                                               \
                          kPaddedWidth * kPaddedHeight * SRC_BPC + OFF);       \
    align_buffer_page_end(                                                     \
-        src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
+        src_uv,                                                                \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                  \
+        kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC * 2 + OFF);       \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);                \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);                \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
-    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);              \
-    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
+    align_buffer_page_end(dst_u_opt,                                           \
                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
    align_buffer_page_end(dst_v_opt,                                           \
                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF);                    \
    SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF);                  \
    for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) {                   \
@ -101,12 +105,12 @@ namespace libyuv {
      src_uv_p[i] =                                                            \
          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH)));       \
    }                                                                          \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                              \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                            \
-    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
+    memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
-    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
+    memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                          \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                        \
-    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
+    memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
-    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
+    memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    SRC_FMT_PLANAR##To##FMT_PLANAR(                                            \
        src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2,                          \
@ -223,11 +227,11 @@ TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                     \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
    align_buffer_page_end(src_u, kSizeUV + OFF);                              \
    align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);             \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);            \
    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
      src_y[i + OFF] = (fastrand() & 0xff);                                   \
    }                                                                         \
@ -381,58 +385,58 @@ TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
 TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
 #endif
-#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,      \
+#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
-                   W1280, N, NEG, OFF)                                         \
+                   W1280, N, NEG, OFF)                                        \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                        \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                       \
-    const int kWidth = W1280;                                                  \
+    const int kWidth = W1280;                                                 \
-    const int kHeight = benchmark_height_;                                     \
+    const int kHeight = benchmark_height_;                                    \
-    const int kStrideB = kWidth * BPP_B;                                       \
+    const int kStrideB = kWidth * BPP_B;                                      \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
-    align_buffer_page_end(src_uv,                                              \
+    align_buffer_page_end(                                                    \
-                          kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+        src_uv, kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF);         \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                    \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight);                    \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                  \
-    for (int i = 0; i < kHeight; ++i)                                          \
+    for (int i = 0; i < kHeight; ++i)                                         \
-      for (int j = 0; j < kWidth; ++j)                                         \
+      for (int j = 0; j < kWidth; ++j)                                        \
-        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                     \
+        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                    \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                  \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
-      for (int j = 0; j < kStrideUV * 2; ++j) {                                \
+      for (int j = 0; j < kStrideUV * 2; ++j) {                               \
-        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);             \
+        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);            \
-      }                                                                        \
+      }                                                                       \
-    }                                                                          \
+    }                                                                         \
-    memset(dst_argb_c, 1, kStrideB* kHeight);                                  \
+    memset(dst_argb_c, 1, kStrideB * kHeight);                                \
-    memset(dst_argb_opt, 101, kStrideB* kHeight);                              \
+    memset(dst_argb_opt, 101, kStrideB * kHeight);                            \
-    MaskCpuFlags(disable_cpu_flags_);                                          \
+    MaskCpuFlags(disable_cpu_flags_);                                         \
-    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,    \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,   \
-                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);    \
+                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);   \
-    MaskCpuFlags(benchmark_cpu_info_);                                         \
+    MaskCpuFlags(benchmark_cpu_info_);                                        \
-    for (int i = 0; i < benchmark_iterations_; ++i) {                          \
+    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
-      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,  \
+      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
-                            dst_argb_opt, kWidth * BPP_B, kWidth,              \
+                            dst_argb_opt, kWidth * BPP_B, kWidth,             \
-                            NEG kHeight);                                      \
+                            NEG kHeight);                                     \
-    }                                                                          \
+    }                                                                         \
-    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
+    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */   \
-    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                 \
+    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                \
-    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);               \
+    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);              \
-    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                             \
+    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                            \
-    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                         \
+    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                        \
-    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,      \
+    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,     \
-                  kHeight);                                                    \
+                  kHeight);                                                   \
-    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth,  \
+    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
-                  kHeight);                                                    \
+                  kHeight);                                                   \
-    for (int i = 0; i < kHeight; ++i) {                                        \
+    for (int i = 0; i < kHeight; ++i) {                                       \
-      for (int j = 0; j < kWidth * 4; ++j) {                                   \
+      for (int j = 0; j < kWidth * 4; ++j) {                                  \
-        ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j],                            \
+        ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j],                           \
-                  dst_argb32_opt[i * kWidth * 4 + j]);                         \
+                  dst_argb32_opt[i * kWidth * 4 + j]);                        \
-      }                                                                        \
+      }                                                                       \
-    }                                                                          \
+    }                                                                         \
-    free_aligned_buffer_page_end(src_y);                                       \
+    free_aligned_buffer_page_end(src_y);                                      \
-    free_aligned_buffer_page_end(src_uv);                                      \
+    free_aligned_buffer_page_end(src_uv);                                     \
-    free_aligned_buffer_page_end(dst_argb_c);                                  \
+    free_aligned_buffer_page_end(dst_argb_c);                                 \
-    free_aligned_buffer_page_end(dst_argb_opt);                                \
+    free_aligned_buffer_page_end(dst_argb_opt);                               \
-    free_aligned_buffer_page_end(dst_argb32_c);                                \
+    free_aligned_buffer_page_end(dst_argb32_c);                               \
-    free_aligned_buffer_page_end(dst_argb32_opt);                              \
+    free_aligned_buffer_page_end(dst_argb32_opt);                             \
  }
 #if defined(ENABLE_FULL_TESTS)
@ -507,15 +511,16 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
    const int kStrideB =                                                       \
        (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                 \
    align_buffer_page_end(src_argb,                                            \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+    align_buffer_page_end(dst_argb_c,                                          \
                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
    align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideB* kHeightB*(int)sizeof(TYPE_B));             \
+                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
      src_argb[i + OFF] = (fastrand() & 0xff);                                 \
    }                                                                          \
-    memset(dst_argb_c, 1, kStrideB* kHeightB);                                 \
+    memset(dst_argb_c, 1, kStrideB * kHeightB);                                \
-    memset(dst_argb_opt, 101, kStrideB* kHeightB);                             \
+    memset(dst_argb_opt, 101, kStrideB * kHeightB);                            \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \
                     kStrideB, kWidth, NEG kHeight);                           \
@ -532,41 +537,42 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
    free_aligned_buffer_page_end(dst_argb_opt);                                \
  }
-#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,        \
+#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,     \
-                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                      \
+                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                   \
-  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                       \
+  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                    \
-    for (int times = 0; times < benchmark_iterations_; ++times) {              \
+    for (int times = 0; times < benchmark_iterations_; ++times) {           \
-      const int kWidth = (fastrand() & 63) + 1;                                \
+      const int kWidth = (fastrand() & 63) + 1;                             \
-      const int kHeight = (fastrand() & 31) + 1;                               \
+      const int kHeight = (fastrand() & 31) + 1;                            \
-      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;     \
+      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;  \
-      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;     \
+      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;  \
-      const int kStrideA =                                                     \
+      const int kStrideA =                                                  \
-          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
+          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;            \
-      const int kStrideB =                                                     \
+      const int kStrideB =                                                  \
-          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
+          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;            \
-      align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+      align_buffer_page_end(src_argb,                                       \
-      align_buffer_page_end(dst_argb_c,                                        \
+                            kStrideA * kHeightA * (int)sizeof(TYPE_A));     \
-                            kStrideB* kHeightB*(int)sizeof(TYPE_B));           \
+      align_buffer_page_end(dst_argb_c,                                     \
-      align_buffer_page_end(dst_argb_opt,                                      \
+                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
-                            kStrideB* kHeightB*(int)sizeof(TYPE_B));           \
+      align_buffer_page_end(dst_argb_opt,                                   \
-      for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {    \
+                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
-        src_argb[i] = 0xfe;                                                    \
+      for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
-      }                                                                        \
+        src_argb[i] = 0xfe;                                                 \
-      memset(dst_argb_c, 123, kStrideB* kHeightB);                             \
+      }                                                                     \
-      memset(dst_argb_opt, 123, kStrideB* kHeightB);                           \
+      memset(dst_argb_c, 123, kStrideB * kHeightB);                         \
-      MaskCpuFlags(disable_cpu_flags_);                                        \
+      memset(dst_argb_opt, 123, kStrideB * kHeightB);                       \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,       \
+      MaskCpuFlags(disable_cpu_flags_);                                     \
-                       kStrideB, kWidth, kHeight);                             \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,    \
-      MaskCpuFlags(benchmark_cpu_info_);                                       \
+                       kStrideB, kWidth, kHeight);                          \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,     \
+      MaskCpuFlags(benchmark_cpu_info_);                                    \
-                       kStrideB, kWidth, kHeight);                             \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,  \
-      for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) {    \
+                       kStrideB, kWidth, kHeight);                          \
-        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
+      for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
-      }                                                                        \
+        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                          \
-      free_aligned_buffer_page_end(src_argb);                                  \
+      }                                                                     \
-      free_aligned_buffer_page_end(dst_argb_c);                                \
+      free_aligned_buffer_page_end(src_argb);                               \
-      free_aligned_buffer_page_end(dst_argb_opt);                              \
+      free_aligned_buffer_page_end(dst_argb_c);                             \
-    }                                                                          \
+      free_aligned_buffer_page_end(dst_argb_opt);                           \
    }                                                                       \
  }
 #if defined(ENABLE_FULL_TESTS)
@ -672,11 +678,11 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
    const int kStrideB =                                                      \
        (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                \
    align_buffer_page_end(src_argb,                                           \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
    align_buffer_page_end(dst_argb_c,                                         \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
    align_buffer_page_end(dst_argb_opt,                                       \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {     \
      src_argb[i + OFF] = (fastrand() & 0xff);                                \
    }                                                                         \
@ -791,14 +797,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
        (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
    const int kStrideB =                                                     \
        (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-    align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF);               \
+    align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF);              \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
    for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
      src_argb[i + OFF] = (fastrand() & 0xff);                               \
    }                                                                        \
-    memset(dst_argb_c, 1, kStrideB* kHeightB);                               \
+    memset(dst_argb_c, 1, kStrideB * kHeightB);                              \
-    memset(dst_argb_opt, 101, kStrideB* kHeightB);                           \
+    memset(dst_argb_opt, 101, kStrideB * kHeightB);                          \
    MaskCpuFlags(disable_cpu_flags_);                                        \
    FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
                             NULL, kWidth, NEG kHeight);                     \
@ -827,14 +833,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
          (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
      const int kStrideB =                                                     \
          (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-      align_buffer_page_end(src_argb, kStrideA* kHeightA);                     \
+      align_buffer_page_end(src_argb, kStrideA * kHeightA);                    \
-      align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
+      align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
-      align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
+      align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
      for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
        src_argb[i] = (fastrand() & 0xff);                                     \
      }                                                                        \
-      memset(dst_argb_c, 123, kStrideB* kHeightB);                             \
+      memset(dst_argb_c, 123, kStrideB * kHeightB);                            \
-      memset(dst_argb_opt, 123, kStrideB* kHeightB);                           \
+      memset(dst_argb_opt, 123, kStrideB * kHeightB);                          \
      MaskCpuFlags(disable_cpu_flags_);                                        \
      FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
                               kWidth, kHeight);                               \
@ -885,15 +891,16 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
    const int kStrideA =                                                       \
        (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;                 \
    align_buffer_page_end(src_argb,                                            \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+    align_buffer_page_end(dst_argb_c,                                          \
                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
    align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A));             \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
      src_argb[i + OFF] = (fastrand() & 0xff);                                 \
    }                                                                          \
-    memset(dst_argb_c, 1, kStrideA* kHeightA);                                 \
+    memset(dst_argb_c, 1, kStrideA * kHeightA);                                \
-    memset(dst_argb_opt, 101, kStrideA* kHeightA);                             \
+    memset(dst_argb_opt, 101, kStrideA * kHeightA);                            \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c,         \
             kStrideA, kWidth, NEG kHeight);                                   \
@ -945,12 +952,12 @@ TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
    align_buffer_page_end(src_u, kSizeUV + OFF);                               \
    align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      src_y[i + OFF] = (fastrand() & 0xff);                                    \
      src_a[i + OFF] = (fastrand() & 0xff);                                    \
@ -1240,11 +1247,11 @@ TEST_F(LibYUVConvertTest, TestDither) {
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
    align_buffer_page_end(src_u, kSizeUV + OFF);                               \
    align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      src_y[i + OFF] = (fastrand() & 0xff);                                    \
    }                                                                          \
@ -1265,10 +1272,10 @@ TEST_F(LibYUVConvertTest, TestDither) {
          dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight);            \
    }                                                                          \
    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
-    align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight);               \
+    align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight);             \
-    align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight);             \
+    align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight);           \
-    memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight);                           \
+    memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight);                         \
-    memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight);                       \
+    memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight);                     \
    FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
                     kWidth, kHeight);                                         \
    FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt,             \
@ -1317,10 +1324,10 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                    \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
    align_buffer_page_end(src_u, kSizeUV + OFF);                              \
    align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);              \
    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
      src_y[i + OFF] = (fastrand() & 0xff);                                   \
    }                                                                         \
@ -1334,8 +1341,8 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
                          kWidth, NEG kHeight);                               \
    /* Convert to a 3rd format in 1 step and 2 steps and compare  */          \
    const int kStrideC = kWidth * BPP_C;                                      \
-    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);             \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                          \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                         \
    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
@ -1464,14 +1471,14 @@ TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
    const int kSizeUV =                                                        \
        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y);          \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
    align_buffer_page_end(src_u, kSizeUV + OFF);                               \
    align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
-    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
    const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
    memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
@ -1578,16 +1585,16 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
    const int kHeight = benchmark_height_;                                     \
    const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                     \
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
-    align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF);                \
+    align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
    MemRandomize(src_argb_a + OFF, kStrideA * kHeight);                        \
    memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
    FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB,   \
                     kWidth, NEG kHeight);                                     \
    /* Convert to a 3rd format in 1 step and 2 steps and compare  */           \
    const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
    for (int i = 0; i < benchmark_iterations_; ++i) {                          \
@ -1798,11 +1805,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
    const int kBpc = 2;                                                       \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);               \
+    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);             \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(src_u, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(src_v, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);             \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);            \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);           \
    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
      reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
    }                                                                         \
@ -1913,12 +1920,12 @@ TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
    const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF);                 \
+    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + OFF);               \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF);                         \
+    align_buffer_page_end(src_u, kSizeUV * kBpc + OFF);                        \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF);                         \
+    align_buffer_page_end(src_v, kSizeUV * kBpc + OFF);                        \
-    align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF);                 \
+    align_buffer_page_end(src_a, kWidth * kHeight * kBpc + OFF);               \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      reinterpret_cast<uint16_t*>(src_y + OFF)[i] =                            \
          (fastrand() & ((1 << S_DEPTH) - 1));                                 \
@ -2146,10 +2153,10 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;         \
    const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);                \
+    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);              \
-    align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(src_uv, kSizeUV * kBpc + SOFF);                      \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);              \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);             \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);            \
    for (int i = 0; i < kWidth * kHeight; ++i) {                               \
      reinterpret_cast<uint16_t*>(src_y + SOFF)[i] =                           \
          (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH)));                 \
@ -2839,8 +2846,12 @@ TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
        int half_width = (width + 1) / 2;
        for (int i = 0; i < half_width; ++i) {
-          ASSERT_EQ(dest_u_c[i], dest_u_opt[i]) << "u mismatch at " << i << " width " << width << " height " << height;
+          ASSERT_EQ(dest_u_c[i], dest_u_opt[i])
-          ASSERT_EQ(dest_v_c[i], dest_v_opt[i]) << "v mismatch at " << i << " width " << width << " height " << height;
+              << "u mismatch at " << i << " width " << width << " height "
              << height;
          ASSERT_EQ(dest_v_c[i], dest_v_opt[i])
              << "v mismatch at " << i << " width " << width << " height "
              << height;
        }
      }
    }
@ -2903,13 +2914,12 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
  free_aligned_buffer_page_end(dest_argb);
  free_aligned_buffer_page_end(orig_i400);
 }
-#endif // DISABLE_SLOW_TESTS
+#endif  // DISABLE_SLOW_TESTS
 #endif  // !defined(DISABLE_SLOW_TESTS) && \
        // (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))
 #endif  // !defined(LEAN_TESTS)
 #define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
                   SUBSAMP_Y, W1280, N, NEG, OFF)                              \
  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) {                             \
@ -2922,17 +2932,17 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
    align_buffer_page_end(src_argb,                                            \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
-    align_buffer_page_end(dst_y_c, kStrideY* kHeight);                         \
+    align_buffer_page_end(dst_y_c, kStrideY * kHeight);                        \
    align_buffer_page_end(dst_uv_c, kSizeUV);                                  \
-    align_buffer_page_end(dst_y_opt, kStrideY* kHeight);                       \
+    align_buffer_page_end(dst_y_opt, kStrideY * kHeight);                      \
    align_buffer_page_end(dst_uv_opt, kSizeUV);                                \
    for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
      src_argb[i + OFF] = (fastrand() & 0xff);                                 \
    }                                                                          \
-    memset(dst_y_c, 1, kStrideY* kHeight);                                     \
+    memset(dst_y_c, 1, kStrideY * kHeight);                                    \
    memset(dst_uv_c, 2, kSizeUV);                                              \
-    memset(dst_y_opt, 101, kStrideY* kHeight);                                 \
+    memset(dst_y_opt, 101, kStrideY * kHeight);                                \
    memset(dst_uv_opt, 102, kSizeUV);                                          \
    MaskCpuFlags(disable_cpu_flags_);                                          \
    FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY,   \
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@ -78,17 +78,19 @@ namespace libyuv {
    const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y);             \
    const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X);               \
    const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF);             \
+    align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF);           \
    align_buffer_page_end(src_u,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
    align_buffer_page_end(src_v,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);   \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);   \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
-    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_u_opt,                                          \
-    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
    align_buffer_page_end(dst_v_opt,                                          \
                          kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
    MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC);                    \
    MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
    MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
@ -102,12 +104,12 @@ namespace libyuv {
      src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1);                       \
      src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1);                       \
    }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
-    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC);               \
+    memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC);             \
-    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC);               \
+    memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC);             \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
-    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC);           \
+    memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC);         \
-    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC);           \
+    memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC);         \
    MaskCpuFlags(disable_cpu_flags_);                                         \
    SRC_FMT_PLANAR##To##FMT_PLANAR(                                           \
        src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth,      \
@ -212,15 +214,15 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
    const int kHeight = benchmark_height_;                                    \
    const int kSizeUV =                                                       \
        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
    align_buffer_page_end(src_uv,                                             \
-                          kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);       \
+                          kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);     \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
    align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                       SUBSAMPLE(kHeight, SUBSAMP_Y));        \
    align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                       SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
    align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
                                         SUBSAMPLE(kHeight, SUBSAMP_Y));      \
    align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
@ -239,12 +241,12 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
            (fastrand() & 0xff);                                              \
      }                                                                       \
    }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight);                                      \
+    memset(dst_y_c, 1, kWidth * kHeight);                                     \
    memset(dst_u_c, 2,                                                        \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
    memset(dst_v_c, 3,                                                        \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
    memset(dst_u_opt, 102,                                                    \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
    memset(dst_v_opt, 103,                                                    \
@ -359,17 +361,17 @@ static int I400ToNV21(const uint8_t* src_y,
    const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y);             \
    const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X);               \
    const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF);             \
+    align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF);           \
    align_buffer_page_end(src_u,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
    align_buffer_page_end(src_v,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
    align_buffer_page_end(dst_uv_c,                                           \
-                          kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);        \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);      \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
    align_buffer_page_end(dst_uv_opt,                                         \
-                          kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);        \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);      \
    MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC);                    \
    MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
    MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
@ -383,10 +385,10 @@ static int I400ToNV21(const uint8_t* src_y,
      src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1);                       \
      src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1);                       \
    }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
-    memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);          \
+    memset(dst_uv_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);        \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
-    memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);      \
+    memset(dst_uv_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);    \
    MaskCpuFlags(disable_cpu_flags_);                                         \
    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth,   \
                                   src_v_p, kSrcHalfWidth,                    \
@ -478,14 +480,15 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
        (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1);                   \
    const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X);   \
    const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+    align_buffer_page_end(src_y,                                              \
                          kPaddedWidth * kPaddedHeight * SRC_BPC + OFF);      \
    align_buffer_page_end(                                                    \
        src_uv,                                                               \
        2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF);      \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
    align_buffer_page_end(dst_uv_c,                                           \
                          2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);      \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
    align_buffer_page_end(dst_uv_opt,                                         \
                          2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);      \
    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF);                   \
@ -502,13 +505,13 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
      src_uv_p[i] =                                                           \
          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH)));      \
    }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
    memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);        \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
    memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);    \
    MaskCpuFlags(disable_cpu_flags_);                                         \
    SRC_FMT_PLANAR##To##FMT_PLANAR(                                           \
-        src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p,              \
+        src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p,             \
        2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T),                     \
        DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth,               \
        reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, kWidth,        \
@ -516,7 +519,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
    MaskCpuFlags(benchmark_cpu_info_);                                        \
    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
      SRC_FMT_PLANAR##To##FMT_PLANAR(                                         \
-          src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p,            \
+          src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p,           \
          2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T),                   \
          DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth,           \
          reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, kWidth,    \
@ -598,16 +601,16 @@ TESTBPTOBP(P010, uint16_t, 2, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
    const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                   \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8;           \
-    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                   \
+    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                  \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                           \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                          \
    align_buffer_page_end(dst_uv_c,                                            \
                          kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                         \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                        \
    align_buffer_page_end(dst_uv_opt,                                          \
                          kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    memset(dst_y_c, 1, kWidth* kHeight);                                       \
+    memset(dst_y_c, 1, kWidth * kHeight);                                      \
    memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                   \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                  \
    memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));    \
    for (int i = 0; i < kHeight; ++i)                                          \
      for (int j = 0; j < kStride; ++j)                                        \
@ -691,20 +694,20 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
    const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                   \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
    const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8;           \
-    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                   \
+    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                  \
-    align_buffer_page_end(dst_a_c, kWidth* kHeight);                           \
+    align_buffer_page_end(dst_a_c, kWidth * kHeight);                          \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                           \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                          \
    align_buffer_page_end(dst_uv_c,                                            \
                          kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    align_buffer_page_end(dst_a_opt, kWidth* kHeight);                         \
+    align_buffer_page_end(dst_a_opt, kWidth * kHeight);                        \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                         \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                        \
    align_buffer_page_end(dst_uv_opt,                                          \
                          kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    memset(dst_a_c, 1, kWidth* kHeight);                                       \
+    memset(dst_a_c, 1, kWidth * kHeight);                                      \
-    memset(dst_y_c, 2, kWidth* kHeight);                                       \
+    memset(dst_y_c, 2, kWidth * kHeight);                                      \
    memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    memset(dst_a_opt, 101, kWidth* kHeight);                                   \
+    memset(dst_a_opt, 101, kWidth * kHeight);                                  \
-    memset(dst_y_opt, 102, kWidth* kHeight);                                   \
+    memset(dst_y_opt, 102, kWidth * kHeight);                                  \
    memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));    \
    for (int i = 0; i < kHeight; ++i)                                          \
      for (int j = 0; j < kStride; ++j)                                        \
@ -765,19 +768,19 @@ TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2)
    const int kHeight = benchmark_height_;                                    \
    const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                     \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
-    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                  \
+    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                 \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
    align_buffer_page_end(dst_uv_c,                                           \
                          kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
    align_buffer_page_end(dst_uv_opt,                                         \
                          kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
    for (int i = 0; i < kHeight; ++i)                                         \
      for (int j = 0; j < kStride; ++j)                                       \
        src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff);              \
-    memset(dst_y_c, 1, kWidth* kHeight);                                      \
+    memset(dst_y_c, 1, kWidth * kHeight);                                     \
    memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));       \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
    memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));   \
    MaskCpuFlags(disable_cpu_flags_);                                         \
    FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
@ -1950,17 +1953,17 @@ TEST_F(LibYUVConvertTest, I420CropOddY) {
    const int kHeight = benchmark_height_;                                    \
                                                                              \
    align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight);     \
-    align_buffer_page_end(orig_y, kWidth* kHeight);                           \
+    align_buffer_page_end(orig_y, kWidth * kHeight);                          \
    align_buffer_page_end(orig_u,                                             \
                          SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));      \
    align_buffer_page_end(orig_v,                                             \
                          SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));      \
                                                                              \
-    align_buffer_page_end(dst_y_orig, kWidth* kHeight);                       \
+    align_buffer_page_end(dst_y_orig, kWidth * kHeight);                      \
    align_buffer_page_end(dst_uv_orig,                                        \
                          2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));  \
                                                                              \
-    align_buffer_page_end(dst_y, kWidth* kHeight);                            \
+    align_buffer_page_end(dst_y, kWidth * kHeight);                           \
    align_buffer_page_end(dst_uv,                                             \
                          2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));  \
                                                                              \
@ -2423,6 +2426,129 @@ TEST_F(LibYUVConvertTest, TestARGBToI444Matrix) {
  free_aligned_buffer_page_end(ref_v);
 }
 template <typename ConvertToYUV, typename ConvertToARGB>
 static void TestRGBToI420(ConvertToYUV convert_to_yuv,
                          ConvertToARGB convert_to_argb, int width, int height,
                          int disable_cpu_flags, int benchmark_cpu_info) {
  align_buffer_page_end(src_rgb, width * height * 4);
  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_u, (width + 1) / 2 * (height + 1) / 2);
  align_buffer_page_end(dst_v, (width + 1) / 2 * (height + 1) / 2);
  align_buffer_page_end(tmp_argb, width * height * 4);
  align_buffer_page_end(ref_y, width * height);
  align_buffer_page_end(ref_u, (width + 1) / 2 * (height + 1) / 2);
  align_buffer_page_end(ref_v, (width + 1) / 2 * (height + 1) / 2);
  MemRandomize(src_rgb, width * height * 4);
  {
    SCOPED_TRACE("C_Version");
    MaskCpuFlags(disable_cpu_flags);
    // Clear buffers
    memset(dst_y, 0, width * height);
    memset(dst_u, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(dst_v, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(ref_y, 0, width * height);
    memset(ref_u, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(ref_v, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(tmp_argb, 0, width * height * 4);
    int r1 =
        convert_to_yuv(src_rgb, width * 4, dst_y, width, dst_u, (width + 1) / 2,
                       dst_v, (width + 1) / 2, width, height);
    ASSERT_EQ(r1, 0);
    int r2 =
        convert_to_argb(src_rgb, width * 4, tmp_argb, width * 4, width, height);
    ASSERT_EQ(r2, 0);
    int r3 = ARGBToI420(tmp_argb, width * 4, ref_y, width, ref_u,
                        (width + 1) / 2, ref_v, (width + 1) / 2, width, height);
    ASSERT_EQ(r3, 0);
    for (int i = 0; i < width * height; ++i) {
      ASSERT_EQ(dst_y[i], ref_y[i]);
    }
    for (int i = 0; i < (width + 1) / 2 * (height + 1) / 2; ++i) {
      ASSERT_EQ(dst_u[i], ref_u[i]);
      ASSERT_EQ(dst_v[i], ref_v[i]);
    }
  }
  {
    SCOPED_TRACE("SIMD_Version");
    MaskCpuFlags(benchmark_cpu_info);
    // Clear buffers
    memset(dst_y, 0, width * height);
    memset(dst_u, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(dst_v, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(ref_y, 0, width * height);
    memset(ref_u, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(ref_v, 0, (width + 1) / 2 * (height + 1) / 2);
    memset(tmp_argb, 0, width * height * 4);
    int r1 =
        convert_to_yuv(src_rgb, width * 4, dst_y, width, dst_u, (width + 1) / 2,
                       dst_v, (width + 1) / 2, width, height);
    ASSERT_EQ(r1, 0);
    int r2 =
        convert_to_argb(src_rgb, width * 4, tmp_argb, width * 4, width, height);
    ASSERT_EQ(r2, 0);
    int r3 = ARGBToI420(tmp_argb, width * 4, ref_y, width, ref_u,
                        (width + 1) / 2, ref_v, (width + 1) / 2, width, height);
    ASSERT_EQ(r3, 0);
    for (int i = 0; i < width * height; ++i) {
      ASSERT_EQ(dst_y[i], ref_y[i]);
    }
    for (int i = 0; i < (width + 1) / 2 * (height + 1) / 2; ++i) {
      ASSERT_EQ(dst_u[i], ref_u[i]);
      ASSERT_EQ(dst_v[i], ref_v[i]);
    }
  }
  free_aligned_buffer_page_end(src_rgb);
  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_u);
  free_aligned_buffer_page_end(dst_v);
  free_aligned_buffer_page_end(tmp_argb);
  free_aligned_buffer_page_end(ref_y);
  free_aligned_buffer_page_end(ref_u);
  free_aligned_buffer_page_end(ref_v);
 }
 // BGRAToI420 must agree with BGRAToARGB followed by ARGBToI420 for an even
 // size, an odd size and a 1280x720 frame.
 TEST_F(LibYUVConvertTest, BGRAToI420_Check) {
  static const int kSizes[][2] = {{16, 16}, {17, 17}, {1280, 720}};
  const int kNumSizes = static_cast<int>(sizeof(kSizes) / sizeof(kSizes[0]));
  for (int n = 0; n < kNumSizes; ++n) {
    TestRGBToI420(BGRAToI420, BGRAToARGB, kSizes[n][0], kSizes[n][1],
                  disable_cpu_flags_, benchmark_cpu_info_);
  }
 }
 // RGBAToI420 must agree with RGBAToARGB followed by ARGBToI420 for an even
 // size, an odd size and a 1280x720 frame.
 TEST_F(LibYUVConvertTest, RGBAToI420_Check) {
  static const int kSizes[][2] = {{16, 16}, {17, 17}, {1280, 720}};
  const int kNumSizes = static_cast<int>(sizeof(kSizes) / sizeof(kSizes[0]));
  for (int n = 0; n < kNumSizes; ++n) {
    TestRGBToI420(RGBAToI420, RGBAToARGB, kSizes[n][0], kSizes[n][1],
                  disable_cpu_flags_, benchmark_cpu_info_);
  }
 }
 // ABGRToI420 must agree with ABGRToARGB followed by ARGBToI420 for an even
 // size, an odd size and a 1280x720 frame.
 TEST_F(LibYUVConvertTest, ABGRToI420_Check) {
  static const int kSizes[][2] = {{16, 16}, {17, 17}, {1280, 720}};
  const int kNumSizes = static_cast<int>(sizeof(kSizes) / sizeof(kSizes[0]));
  for (int n = 0; n < kNumSizes; ++n) {
    TestRGBToI420(ABGRToI420, ABGRToARGB, kSizes[n][0], kSizes[n][1],
                  disable_cpu_flags_, benchmark_cpu_info_);
  }
 }
 #endif  // !defined(LEAN_TESTS)
 }  // namespace libyuv
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -1212,10 +1212,10 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) {
        (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;                \
    const int kStrideB =                                                      \
        (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                \
-    align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF);               \
+    align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);              \
-    align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF);               \
+    align_buffer_page_end(src_argb_b, kStrideA * kHeight + OFF);              \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight);                     \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                    \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight);                   \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                  \
    for (int i = 0; i < kStrideA * kHeight; ++i) {                            \
      src_argb_a[i + OFF] = (fastrand() & 0xff);                              \
      src_argb_b[i + OFF] = (fastrand() & 0xff);                              \
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
@ -495,15 +495,15 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
    const int kHeight = benchmark_height_;                                    \
    const int kSizeUV =                                                       \
        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
    align_buffer_page_end(src_uv,                                             \
-                          kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);       \
+                          kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);     \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
    align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                       SUBSAMPLE(kHeight, SUBSAMP_Y));        \
    align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                       SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
    align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
                                         SUBSAMPLE(kHeight, SUBSAMP_Y));      \
    align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
@ -522,12 +522,12 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
            (fastrand() & 0xff);                                              \
      }                                                                       \
    }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight);                                      \
+    memset(dst_y_c, 1, kWidth * kHeight);                                     \
    memset(dst_u_c, 2,                                                        \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
    memset(dst_v_c, 3,                                                        \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
    memset(dst_u_opt, 102,                                                    \
           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
    memset(dst_v_opt, 103,                                                    \
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@ -430,14 +430,10 @@ static void FillRamp(uint8_t* buf,
 }
 // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static void YUVToARGBTestFilter(int src_width,
+static void YUVToARGBTestFilter(int src_width, int src_height, int dst_width,
-                               int src_height,
+                                int dst_height, FilterMode f,
-                               int dst_width,
+                                int benchmark_iterations, int error_threshold,
-                               int dst_height,
+                                int* max_diff_out) {
                               FilterMode f,
                               int benchmark_iterations,
                               int error_threshold,
                               int* max_diff_out) {
  int64_t src_y_plane_size = Abs(src_width) * Abs(src_height);
  int64_t src_uv_plane_size =
      ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
@ -516,10 +512,10 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
 // Runs YUVToARGBTestFilter with a source of 3/2 the benchmark width and
 // height and a destination of the benchmark size, using kFilterBilinear and
 // an error threshold of 10, then requires the reported maximum difference to
 // be at most 10.
 TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
  int diff = 0;
-  YUVToARGBTestFilter(
+  YUVToARGBTestFilter(benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
-      benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_,
+                      benchmark_width_, benchmark_height_,
-      benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_, 10,
+                      libyuv::kFilterBilinear, benchmark_iterations_, 10,
-      &diff);
+                      &diff);
  ASSERT_LE(diff, 10);
 }