diff --git a/GEMINI.md b/GEMINI.md
index 3bda686fd..03cdc986d 100644
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -1,44 +1,62 @@
 # Gemini Project Context: libyuv Row Functions
 
-This file provides context for the core row-processing architecture of libyuv. Use these guidelines when refactoring, reviewing, or generating code within the `row_*.cc` files.
+This file provides context for the core row-processing architecture of
+libyuv. Use these guidelines when refactoring, reviewing, or generating
+code within the `row_*.cc` files.
 
 ## Architectural Overview
 
-Libyuv uses a dispatch system where high-level conversion functions call optimized "Row" functions. These functions are categorized by SIMD architecture and compiler compatibility.
+Libyuv uses a dispatch system where high-level conversion functions call
+optimized "Row" functions. These functions are categorized by SIMD architecture
+and compiler compatibility.
 
 ## Source File Map
 
 ### x86 Architectures (32-bit and 64-bit)
 
-*   **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for GCC and Clang. Supports AVX, and AVX512. AVX512 implementations are strictly for 64-bit targets.
-*   **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics specifically for Visual C++ (MSVC). Can be tested with Clang using `-DLIBYUV_ENABLE_ROWWIN`.
+*   **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for
+    GCC and Clang. Supports AVX and AVX512. AVX512 implementations are strictly
+    for 64-bit targets.
+*   **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics
+    specifically for Visual C++ (MSVC). Can be tested with Clang using
+    `-DLIBYUV_ENABLE_ROWWIN`.
 *   **Note**: Use either `row_gcc` or `row_win`, never both.
 
 ### ARM Architectures
 
-*   **row_neon.cc**: 32-bit ARM. Written entirely in inline assembly for GCC/Clang.
-*   **row_neon64.cc**: 64-bit ARM (AArch64). Written entirely in inline assembly for GCC/Clang.
+*   **row_neon.cc**: 32-bit ARM. Written entirely in inline assembly for
+    GCC/Clang.
+*   **row_neon64.cc**: 64-bit ARM (AArch64). Written entirely in inline assembly
+    for GCC/Clang.
 *   **row_sve.cc**: ARMv9 Scalable Vector Extensions (SVE).
-*   **row_sme.cc**: ARMv9 Scalable Matrix Extension (SME) and Streaming SVE (SSVE).
+*   **row_sme.cc**: ARMv9 Scalable Matrix Extension (SME) and Streaming SVE
+    (SSVE).
 
 ### Other Architectures
 
-*   **row_rvv.cc**: RISC-V Vector (RVV). Implemented using intrinsics. Optimized for SiFive X280.
+*   **row_rvv.cc**: RISC-V Vector (RVV). Implemented using intrinsics. Optimized
+    for SiFive X280.
 *   **row_lsx.cc / row_lasx.cc**: Loongarch MIPS-like extensions.
 
 ### Utility and Fallbacks
 
-*   **row_common.cc**: Portable C/C++ versions. This is the reference implementation.
-*   **row_any.cc**: Handles "remainder" pixels for widths not multiples of SIMD register size. Used for x86, NEON, and MIPS. Not required for SVE, SME, or RVV due to hardware-level masking.
+*   **row_common.cc**: Portable C/C++ versions. This is the reference
+    implementation.
+*   **row_any.cc**: Handles "remainder" pixels for widths that are not
+    multiples of the SIMD register size. Used for x86, NEON, and MIPS. Not
+    required for SVE, SME, or RVV due to hardware-level masking.
 
 ## Coding Guidelines
 
-1.  **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 only**.
-2.  **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to enable or disable specific AVX512 versions.
+1.  **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86
+    only**.
+2.  **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to
+    enable or disable specific AVX512 versions.
 
 ## Changelist (CL) & Commit Guidelines
 
-When generating descriptions, follow the Chromium/Google standard format. Wrap commit message text at 72 characters
+When generating descriptions, follow the Chromium/Google standard format. Wrap
+commit message text at 72 characters.
 
 ### Format Example:
 
diff --git a/README.chromium b/README.chromium
index f97dcea59..e025cb9d6 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1946
+Version: 1947
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5aced2a2a..1ec86f5eb 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -23,10 +23,11 @@ extern "C" {
 #endif
 
 // This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__) || \
-     defined(_M_X64) || defined(_M_X86))
-#if ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN))
+#if !defined(LIBYUV_DISABLE_X86) &&                                 \
+    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
+     defined(_M_X86))
+#if ((defined(_MSC_VER) && !defined(__clang__)) || \
+     defined(LIBYUV_ENABLE_ROWWIN))
 #define USE_ROW_WIN
 #else
 #define USE_ROW_GCC
@@ -121,9 +122,9 @@ extern "C" {
 
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__) || \
-     defined(_M_X64) || defined(_M_X86))
+#if !defined(LIBYUV_DISABLE_X86) &&                                 \
+    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
+     defined(_M_X86))
 #define HAS_ARGBMIRRORROW_AVX2
 #define HAS_RGB24MIRRORROW_AVX2
 #define HAS_ARGBTOUVMATRIXROW_AVX2
@@ -139,7 +140,7 @@ extern "C" {
 #define HAS_INTERPOLATEROW_AVX2
 #endif
 
-#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
+#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) &&  \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
      defined(GCC_HAS_AVX2))
 #define HAS_ARGBCOPYALPHAROW_AVX2
@@ -183,7 +184,7 @@ extern "C" {
 // The following are available for gcc/clang x86 platforms:
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
-    (defined(__x86_64__) || defined(__i386__)) && \
+    (defined(__x86_64__) || defined(__i386__)) &&           \
     !defined(LIBYUV_ENABLE_ROWWIN)
 #define HAS_AB64TOARGBROW_SSSE3
 #define HAS_ABGRTOAR30ROW_SSSE3
@@ -259,8 +260,8 @@ extern "C" {
 // The following are available for AVX2 gcc/clang x86 platforms:
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
-    (defined(__x86_64__) || defined(__i386__)) &&         \
-    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) && \
+    (defined(__x86_64__) || defined(__i386__)) &&           \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) &&   \
     !defined(LIBYUV_ENABLE_ROWWIN)
 #define HAS_AB64TOARGBROW_AVX2
 #define HAS_ABGRTOAR30ROW_AVX2
@@ -342,19 +343,21 @@ extern "C" {
 #endif
 
 // This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_WIN) && \
-    (defined(__x86_64__) || defined(__i386__) || \
-     defined(_M_X64) || defined(_M_X86)) && \
-    ((defined(_MSC_VER) && !defined(__clang__)) || \
+#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_WIN) &&         \
+    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
+     defined(_M_X86)) &&                                            \
+    ((defined(_MSC_VER) && !defined(__clang__)) ||                  \
      defined(LIBYUV_ENABLE_ROWWIN))
 #define HAS_RAWTOARGBROW_AVX2
 #define HAS_RGB24TOARGBROW_AVX2
 #define HAS_RGB565TOARGBROW_AVX2
 #define HAS_ARGB1555TOARGBROW_AVX2
 #define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
 #if defined(__x86_64__) || defined(_M_X64)
 #define HAS_RAWTOARGBROW_AVX512BW
 #define HAS_RGB24TOARGBROW_AVX512BW
+#define HAS_ARGBSHUFFLEROW_AVX512BW
 #endif
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOYMATRIXROW_AVX2
@@ -383,7 +386,6 @@ extern "C" {
 #endif
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_CONVERT16TO8ROW_AVX512BW
-#define HAS_MERGEUVROW_AVX512BW
 #endif
 
 // The following are available for AVX512 clang x64 platforms:
@@ -401,6 +403,11 @@ extern "C" {
 #define HAS_ARGBTOUVJROW_AVX512BW
 #define HAS_ARGBTOUVMATRIXROW_AVX512BW
 #define HAS_J400TOARGBROW_AVX512BW
+#define HAS_MERGEUVROW_AVX512BW
+#define HAS_MIRRORROW_AVX512BW
+#define HAS_MIRRORSPLITUVROW_AVX512BW
+#define HAS_SPLITUVROW_AVX512BW
+#define HAS_RGBTOUVMATRIXROW_AVX512BW
 #endif
 
 // The following are available on Neon platforms:
@@ -1041,7 +1048,7 @@ struct ArgbConstants {
 
 #endif
 
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
 
 #define align_buffer_64(var, size)                                         \
   size_t var##_mem_size = (size); /* NOLINT */                             \
@@ -1097,26 +1104,17 @@ struct ArgbConstants {
 #define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
 
 #else /* Visual C */
-#define IACA_UD_BYTES \
-  { __asm _emit 0x0F __asm _emit 0x0B }
+#define IACA_UD_BYTES {__asm _emit 0x0F __asm _emit 0x0B}
 
 #define IACA_SSC_MARK(x) \
-  { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
+  {__asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90}
 
 #define IACA_VC64_START __writegsbyte(111, 111);
 #define IACA_VC64_END __writegsbyte(222, 222);
 #endif
 
-#define IACA_START     \
-  {                    \
-    IACA_UD_BYTES      \
-    IACA_SSC_MARK(111) \
-  }
-#define IACA_END       \
-  {                    \
-    IACA_SSC_MARK(222) \
-    IACA_UD_BYTES      \
-  }
+#define IACA_START {IACA_UD_BYTES IACA_SSC_MARK(111)}
+#define IACA_END {IACA_SSC_MARK(222) IACA_UD_BYTES}
 
 void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
                              const uint16_t* src_u,
@@ -1828,9 +1826,9 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                                int width,
                                const struct ArgbConstants* c);
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c);
+                           uint8_t* dst_y,
+                           int width,
+                           const struct ArgbConstants* c);
 void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb,
                                    uint8_t* dst_u,
                                    uint8_t* dst_v,
@@ -2194,10 +2192,26 @@ void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565,
                           uint8_t* dst_y,
                           int width,
                           const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555,
+                            uint8_t* dst_y,
+                            int width,
+                            const struct ArgbConstants* c);
+void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
+                             int src_stride_argb1555,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width,
+                             const struct ArgbConstants* c);
+void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444,
+                            uint8_t* dst_y,
+                            int width,
+                            const struct ArgbConstants* c);
+void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
+                             int src_stride_argb4444,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width,
+                             const struct ArgbConstants* c);
 void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
                            int src_stride_rgb565,
                            uint8_t* dst_u,
@@ -2210,8 +2224,30 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
                              uint8_t* dst_v,
                              int width,
                              const struct ArgbConstants* c);
-void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
+                           int src_stride_rgb,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width,
+                           const struct ArgbConstants* c);
+void RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb,
+                               int src_stride_rgb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct ArgbConstants* c);
+void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb,
+                               int src_stride_rgb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct ArgbConstants* c);
+void RGBToUVMatrixRow_Any_AVX512BW(const uint8_t* src_rgb,
+                                   int src_stride_rgb,
+                                   uint8_t* dst_u,
+                                   uint8_t* dst_v,
+                                   int width,
+                                   const struct ArgbConstants* c);
 void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                             int src_stride_argb,
                             uint8_t* dst_u,
@@ -2301,18 +2337,66 @@ void RGB565ToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb565,
                                   uint8_t* dst_v,
                                   int width,
                                   const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565,
+                             uint8_t* dst_y,
+                             int width,
+                             const struct ArgbConstants* c);
+void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555,
+                               uint8_t* dst_y,
+                               int width,
+                               const struct ArgbConstants* c);
+void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
+                                int src_stride_argb1555,
+                                uint8_t* dst_u,
+                                uint8_t* dst_v,
+                                int width,
+                                const struct ArgbConstants* c);
+void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444,
+                               uint8_t* dst_y,
+                               int width,
+                               const struct ArgbConstants* c);
+void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444,
+                                int src_stride_argb4444,
+                                uint8_t* dst_u,
+                                uint8_t* dst_v,
+                                int width,
+                                const struct ArgbConstants* c);
+void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565,
+                              int src_stride_rgb565,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width,
+                              const struct ArgbConstants* c);
+void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565,
+                                 uint8_t* dst_y,
+                                 int width,
+                                 const struct ArgbConstants* c);
+void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555,
+                                   uint8_t* dst_y,
+                                   int width,
+                                   const struct ArgbConstants* c);
+void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555,
+                                    int src_stride_argb1555,
+                                    uint8_t* dst_u,
+                                    uint8_t* dst_v,
+                                    int width,
+                                    const struct ArgbConstants* c);
+void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444,
+                                   uint8_t* dst_y,
+                                   int width,
+                                   const struct ArgbConstants* c);
+void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444,
+                                    int src_stride_argb4444,
+                                    uint8_t* dst_u,
+                                    uint8_t* dst_v,
+                                    int width,
+                                    const struct ArgbConstants* c);
+void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565,
+                                  int src_stride_rgb565,
+                                  uint8_t* dst_u,
+                                  uint8_t* dst_v,
+                                  int width,
+                                  const struct ArgbConstants* c);
 
 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_y,
@@ -2340,9 +2424,22 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                           int width,
                           const struct ArgbConstants* c);
 
-void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
-void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct ArgbConstants* c);
-void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c);
+void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb,
+                           int src_stride_rgb,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width,
+                           const struct ArgbConstants* c);
+void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb,
+                              uint8_t* dst_y,
+                              int width,
+                              const struct ArgbConstants* c);
+void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb,
+                               int src_stride_rgb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct ArgbConstants* c);
 
 void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
                                    uint8_t* dst_y,
@@ -2374,7 +2471,6 @@ void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb,
                                int width,
                                const struct ArgbConstants* c);
 
-
 void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
@@ -2432,15 +2528,29 @@ void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
 void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
 void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
 void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
-void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
 void RGBAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
 void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
 void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -3040,12 +3150,16 @@ void ARGBToUVJ444Row_C(const uint8_t* src_argb,
                        uint8_t* dst_v,
                        int width);
 
+void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_AVX512BW(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -3063,6 +3177,10 @@ void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 
+void MirrorSplitUVRow_AVX512BW(const uint8_t* src,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width);
 void MirrorSplitUVRow_AVX2(const uint8_t* src,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
@@ -3124,6 +3242,10 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
+void SplitUVRow_AVX512BW(const uint8_t* src_uv,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
 void SplitUVRow_AVX2(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
@@ -3140,6 +3262,10 @@ void SplitUVRow_RVV(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
+void SplitUVRow_Any_AVX512BW(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
 void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
@@ -4160,8 +4286,12 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                           int width);
 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width);
+void RAWToARGBRow_AVX512BW(const uint8_t* src_raw,
+                           uint8_t* dst_argb,
+                           int width);
 void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 
@@ -4250,9 +4380,7 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
 void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
-void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr,
-                           uint8_t* dst_ptr,
-                           int width);
+void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RGB24ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
@@ -4272,7 +4400,6 @@ void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int width);
 
-
 void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int width);
diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h
index f7e2123a7..280d635b9 100644
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@@ -631,8 +631,8 @@ static inline void I422ToRGB565Row_SVE_SC(
       // Calculate a predicate for the final iteration to deal with the tail.
       "cnth     %[vl]                                   \n"
       "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
-          RGB8TORGB565_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X
+          RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
       // Need to permute the data on the final iteration such that the
       // predicates (.b) line up with the 16-bit element data.
       "trn1     z20.b, z18.b, z19.b                     \n"
@@ -694,8 +694,8 @@ static inline void I422ToARGB1555Row_SVE_SC(
       // Calculate a predicate for the final iteration to deal with the tail.
       "cnth     %[vl]                                   \n"
       "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
-          RGB8TOARGB1555_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X
+          RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
       "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"
 
       "99:                                              \n"
@@ -753,8 +753,8 @@ static inline void I422ToARGB4444Row_SVE_SC(
       // Calculate a predicate for the final iteration to deal with the tail.
       "cnth     %[vl]                                   \n"
       "whilelt  p1.b, wzr, %w[width]                    \n"  //
-      READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
-          RGB8TOARGB4444_SVE_FROM_TOP_2X
+      READYUV422_SVE_2X I422TORGB_SVE_2X
+          RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
       "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"
 
       "99:                                              \n"
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d90f894f7..b12b94978 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1946
+#define LIBYUV_VERSION 1947
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 756f83cb3..36c5e575c 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -116,7 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
   uint32_t hash = seed;
   const uint32_t c16 = 0x92d9e201;  // 33^16
   uint32_t tmp, tmp2;
-      asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
+  asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
       "ld1         {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
 
       // count is always a multiple of 16.
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 9d5bb27cd..59374cd8a 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -41,8 +41,9 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
   return diff;
 }
 
-__declspec(naked) uint32_t
-    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
+__declspec(naked) uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+                                               const uint8_t* src_b,
+                                               int count) {
   __asm {
     mov        eax, [esp + 4]  // src_a
     mov        edx, [esp + 8]  // src_b
@@ -81,8 +82,9 @@ __declspec(naked) uint32_t
 #ifdef HAS_SUMSQUAREERROR_AVX2
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
 #pragma warning(disable : 4752)
-__declspec(naked) uint32_t
-    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
+__declspec(naked) uint32_t SumSquareError_AVX2(const uint8_t* src_a,
+                                               const uint8_t* src_b,
+                                               int count) {
   __asm {
     mov        eax, [esp + 4]  // src_a
     mov        edx, [esp + 8]  // src_b
@@ -146,8 +148,9 @@ uvec32 kHashMul3 = {
     0x00000001,  // 33 ^ 0
 };
 
-__declspec(naked) uint32_t
-    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+__declspec(naked) uint32_t HashDjb2_SSE41(const uint8_t* src,
+                                          int count,
+                                          uint32_t seed) {
   __asm {
     mov        eax, [esp + 4]  // src
     mov        ecx, [esp + 8]  // count
@@ -197,8 +200,9 @@ __declspec(naked) uint32_t
 
 // Visual C 2012 required for AVX2.
 #ifdef HAS_HASHDJB2_AVX2
-__declspec(naked) uint32_t
-    HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
+__declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src,
+                                         int count,
+                                         uint32_t seed) {
   __asm {
     mov        eax, [esp + 4]  // src
     mov        ecx, [esp + 8]  // count
diff --git a/source/convert.cc b/source/convert.cc
index c5e4be418..fbef68f57 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -13,12 +13,11 @@
 #include <limits.h>
 
 #include "libyuv/basic_types.h"
+#include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
-#include "libyuv/convert_from_argb.h"
 #include "libyuv/rotate.h"
 #include "libyuv/row.h"
-
 #include "libyuv/scale.h"      // For ScalePlane()
 #include "libyuv/scale_row.h"  // For FixedDiv
 #include "libyuv/scale_uv.h"   // For UVScale()
@@ -2034,8 +2033,8 @@ int ARGBToI420(const uint8_t* src_argb,
                int width,
                int height) {
   return ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kArgbI601Constants, width, height);
 }
 
 LIBYUV_API
@@ -2056,7 +2055,7 @@ int ARGBToI420Matrix(const uint8_t* src_argb,
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
                             const struct ArgbConstants* c) =
-ARGBToUVMatrixRow_C;
+      ARGBToUVMatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -2121,34 +2120,34 @@ ARGBToUVMatrixRow_C;
 #endif
 
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-      }
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-      }
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -2439,8 +2438,8 @@ int BGRAToI420(const uint8_t* src_bgra,
                int width,
                int height) {
   return ARGBToI420Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kBgraI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kBgraI601Constants, width, height);
 }
 
 // Convert BGRA to I422.
@@ -2456,8 +2455,8 @@ int BGRAToI422(const uint8_t* src_bgra,
                int width,
                int height) {
   return ARGBToI422Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kBgraI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kBgraI601Constants, width, height);
 }
 
 // Convert ABGR to I422.
@@ -2473,8 +2472,8 @@ int ABGRToI422(const uint8_t* src_abgr,
                int width,
                int height) {
   return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kAbgrI601Constants, width, height);
 }
 
 // Convert RGBA to I422.
@@ -2490,8 +2489,8 @@ int RGBAToI422(const uint8_t* src_rgba,
                int width,
                int height) {
   return ARGBToI422Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kRgbaI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kRgbaI601Constants, width, height);
 }
 
 // Convert ABGR to I420.
@@ -2507,8 +2506,8 @@ int ABGRToI420(const uint8_t* src_abgr,
                int width,
                int height) {
   return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kAbgrI601Constants, width, height);
 }
 
 // Convert RGBA to I420.
@@ -2524,8 +2523,8 @@ int RGBAToI420(const uint8_t* src_rgba,
                int width,
                int height) {
   return ARGBToI420Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kRgbaI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kRgbaI601Constants, width, height);
 }
 
 // Enabled if 1 pass is available
@@ -2569,6 +2568,14 @@ int RGB24ToI420(const uint8_t* src_rgb24,
     }
   }
 #endif
+#if defined(HAS_RGBTOUVMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_RGBTOUVMATRIXROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON;
@@ -2603,9 +2610,11 @@ int RGB24ToI420(const uint8_t* src_rgb24,
   }
 
   for (y = 0; y < height - 1; y += 2) {
-    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, &kArgbI601Constants);
+    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width,
+                     &kArgbI601Constants);
     RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width,
+                    &kArgbI601Constants);
     src_rgb24 += src_stride_rgb24 * 2;
     dst_y += dst_stride_y * 2;
     dst_u += dst_stride_u;
@@ -2854,15 +2863,15 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
 // Convert RAW to I420.
 LIBYUV_API
 int RAWToI420(const uint8_t* src_rgb24,
-                int src_stride_rgb24,
-                uint8_t* dst_y,
-                int dst_stride_y,
-                uint8_t* dst_u,
-                int dst_stride_u,
-                uint8_t* dst_v,
-                int dst_stride_v,
-                int width,
-                int height) {
+              int src_stride_rgb24,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
   void (*RGBToUVMatrixRow)(const uint8_t* src_rgb, int src_stride_rgb,
                            uint8_t* dst_u, uint8_t* dst_v, int width,
@@ -2886,6 +2895,14 @@ int RAWToI420(const uint8_t* src_rgb24,
     }
   }
 #endif
+#if defined(HAS_RGBTOUVMATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_RGBTOUVMATRIXROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON;
@@ -2920,9 +2937,11 @@ int RAWToI420(const uint8_t* src_rgb24,
   }
 
   for (y = 0; y < height - 1; y += 2) {
-    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, &kArgbI601Constants);
+    RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width,
+                     &kArgbI601Constants);
     RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants);
-    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width,
+                    &kArgbI601Constants);
     src_rgb24 += src_stride_rgb24 * 2;
     dst_y += dst_stride_y * 2;
     dst_u += dst_stride_u;
@@ -3622,9 +3641,11 @@ int RGB565ToI420(const uint8_t* src_rgb565,
   int y;
   void (*RGB565ToUVMatrixRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
                               uint8_t* dst_u, uint8_t* dst_v, int width,
-                              const struct ArgbConstants* c) = RGB565ToUVMatrixRow_C;
-  void (*RGB565ToYMatrixRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width,
-                             const struct ArgbConstants* c) = RGB565ToYMatrixRow_C;
+                              const struct ArgbConstants* c) =
+      RGB565ToUVMatrixRow_C;
+  void (*RGB565ToYMatrixRow)(const uint8_t* src_rgb565, uint8_t* dst_y,
+                             int width, const struct ArgbConstants* c) =
+      RGB565ToYMatrixRow_C;
 
 #if defined(HAS_RGB565TOYMATRIXROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
@@ -3671,9 +3692,11 @@ int RGB565ToI420(const uint8_t* src_rgb565,
   }
 
   for (y = 0; y < height - 1; y += 2) {
-    RGB565ToUVMatrixRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width, &kArgbI601Constants);
+    RGB565ToUVMatrixRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width,
+                        &kArgbI601Constants);
     RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants);
-    RGB565ToYMatrixRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    RGB565ToYMatrixRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y,
+                       width, &kArgbI601Constants);
     src_rgb565 += src_stride_rgb565 * 2;
     dst_y += dst_stride_y * 2;
     dst_u += dst_stride_u;
@@ -3681,30 +3704,31 @@ int RGB565ToI420(const uint8_t* src_rgb565,
   }
   if (height & 1) {
     RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants);
-    RGB565ToUVMatrixRow(src_rgb565, 0, dst_u, dst_v, width, &kArgbI601Constants);
+    RGB565ToUVMatrixRow(src_rgb565, 0, dst_u, dst_v, width,
+                        &kArgbI601Constants);
   }
   return 0;
 }
 // Convert ARGB1555 to I420.
 LIBYUV_API
 int ARGB1555ToI420(const uint8_t* src_argb1555,
-                 int src_stride_argb1555,
-                 uint8_t* dst_y,
-                 int dst_stride_y,
-                 uint8_t* dst_u,
-                 int dst_stride_u,
-                 uint8_t* dst_v,
-                 int dst_stride_v,
-                 int width,
-                 int height) {
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
   void (*ARGB1555ToUVMatrixRow)(
       const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u,
-      uint8_t* dst_v, int width,
-      const struct ArgbConstants* c) = ARGB1555ToUVMatrixRow_C;
-  void (*ARGB1555ToYMatrixRow)(
-      const uint8_t* src_argb1555, uint8_t* dst_y, int width,
-      const struct ArgbConstants* c) = ARGB1555ToYMatrixRow_C;
+      uint8_t* dst_v, int width, const struct ArgbConstants* c) =
+      ARGB1555ToUVMatrixRow_C;
+  void (*ARGB1555ToYMatrixRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c) =
+      ARGB1555ToYMatrixRow_C;
 
 #if defined(HAS_ARGB1555TOYMATRIXROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
@@ -3751,9 +3775,11 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
   }
 
   for (y = 0; y < height - 1; y += 2) {
-    ARGB1555ToUVMatrixRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB1555ToUVMatrixRow(src_argb1555, src_stride_argb1555, dst_u, dst_v,
+                          width, &kArgbI601Constants);
     ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants);
-    ARGB1555ToYMatrixRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    ARGB1555ToYMatrixRow(src_argb1555 + src_stride_argb1555,
+                         dst_y + dst_stride_y, width, &kArgbI601Constants);
     src_argb1555 += src_stride_argb1555 * 2;
     dst_y += dst_stride_y * 2;
     dst_u += dst_stride_u;
@@ -3761,30 +3787,31 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
   }
   if (height & 1) {
     ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants);
-    ARGB1555ToUVMatrixRow(src_argb1555, 0, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB1555ToUVMatrixRow(src_argb1555, 0, dst_u, dst_v, width,
+                          &kArgbI601Constants);
   }
   return 0;
 }
 // Convert ARGB4444 to I420.
 LIBYUV_API
 int ARGB4444ToI420(const uint8_t* src_argb4444,
-                 int src_stride_argb4444,
-                 uint8_t* dst_y,
-                 int dst_stride_y,
-                 uint8_t* dst_u,
-                 int dst_stride_u,
-                 uint8_t* dst_v,
-                 int dst_stride_v,
-                 int width,
-                 int height) {
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
   void (*ARGB4444ToUVMatrixRow)(
       const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u,
-      uint8_t* dst_v, int width,
-      const struct ArgbConstants* c) = ARGB4444ToUVMatrixRow_C;
-  void (*ARGB4444ToYMatrixRow)(
-      const uint8_t* src_argb4444, uint8_t* dst_y, int width,
-      const struct ArgbConstants* c) = ARGB4444ToYMatrixRow_C;
+      uint8_t* dst_v, int width, const struct ArgbConstants* c) =
+      ARGB4444ToUVMatrixRow_C;
+  void (*ARGB4444ToYMatrixRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+                               int width, const struct ArgbConstants* c) =
+      ARGB4444ToYMatrixRow_C;
 
 #if defined(HAS_ARGB4444TOYMATRIXROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
@@ -3831,9 +3858,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
   }
 
   for (y = 0; y < height - 1; y += 2) {
-    ARGB4444ToUVMatrixRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB4444ToUVMatrixRow(src_argb4444, src_stride_argb4444, dst_u, dst_v,
+                          width, &kArgbI601Constants);
     ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants);
-    ARGB4444ToYMatrixRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, width, &kArgbI601Constants);
+    ARGB4444ToYMatrixRow(src_argb4444 + src_stride_argb4444,
+                         dst_y + dst_stride_y, width, &kArgbI601Constants);
     src_argb4444 += src_stride_argb4444 * 2;
     dst_y += dst_stride_y * 2;
     dst_u += dst_stride_u;
@@ -3841,7 +3870,8 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
   }
   if (height & 1) {
     ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants);
-    ARGB4444ToUVMatrixRow(src_argb4444, 0, dst_u, dst_v, width, &kArgbI601Constants);
+    ARGB4444ToUVMatrixRow(src_argb4444, 0, dst_u, dst_v, width,
+                          &kArgbI601Constants);
   }
   return 0;
 }
@@ -3993,7 +4023,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
     RGB24ToARGBRow = RGB24ToARGBRow_RVV;
   }
 #endif
-{
+  {
     // Allocate 1 row of ARGB.
     const int row_size = (width * 4 + 31) & ~31;
     align_buffer_64(row, row_size);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 2df97d079..a0b9c5d37 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3720,7 +3720,7 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
     RGB24ToARGBRow = RGB24ToARGBRow_RVV;
   }
 #endif
-for (y = 0; y < height; ++y) {
+  for (y = 0; y < height; ++y) {
     RGB24ToARGBRow(src_rgb24, dst_argb, width);
     src_rgb24 += src_stride_rgb24;
     dst_argb += dst_stride_argb;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index d912f4537..77b3851d4 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -35,8 +35,8 @@ int ARGBToI444(const uint8_t* src_argb,
                int width,
                int height) {
   return ARGBToI444Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kArgbI601Constants, width, height);
 }
 
 LIBYUV_API
@@ -54,10 +54,9 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
   int y;
   void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                            const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
-  void (*ARGBToUV444MatrixRow)(const uint8_t* src_argb, uint8_t* dst_u,
-                               uint8_t* dst_v, int width,
-                               const struct ArgbConstants* c) =
-ARGBToUV444MatrixRow_C;
+  void (*ARGBToUV444MatrixRow)(
+      const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width,
+      const struct ArgbConstants* c) = ARGBToUV444MatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -188,8 +187,8 @@ int ARGBToI422(const uint8_t* src_argb,
                int width,
                int height) {
   return ARGBToI422Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kArgbI601Constants, width, height);
 }
 
 LIBYUV_API
@@ -210,7 +209,7 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
                             const struct ArgbConstants* c) =
-ARGBToUVMatrixRow_C;
+      ARGBToUVMatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -275,34 +274,34 @@ ARGBToUVMatrixRow_C;
 #endif
 
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-      }
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-      }
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -359,8 +358,9 @@ int ARGBToNV12(const uint8_t* src_argb,
                int dst_stride_uv,
                int width,
                int height) {
-  return ARGBToNV12Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_uv,
-                          dst_stride_uv, &kArgbI601Constants, width, height);
+  return ARGBToNV12Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                          dst_uv, dst_stride_uv, &kArgbI601Constants, width,
+                          height);
 }
 
 LIBYUV_API
@@ -380,7 +380,7 @@ int ARGBToNV12Matrix(const uint8_t* src_argb,
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
                             const struct ArgbConstants* c) =
-ARGBToUVMatrixRow_C;
+      ARGBToUVMatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -445,34 +445,34 @@ ARGBToUVMatrixRow_C;
 #endif
 
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-      }
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-      }
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -565,7 +565,7 @@ ARGBToUVMatrixRow_C;
     MergeUVRow(row_u, row_v, dst_uv, halfwidth);
     ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
     ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width,
-                      argbconstants);
+                     argbconstants);
     src_argb += src_stride_argb * 2;
     dst_y += dst_stride_y * 2;
     dst_uv += dst_stride_uv;
@@ -595,7 +595,7 @@ int ARGBToNV21Matrix(const uint8_t* src_argb,
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
                             const struct ArgbConstants* c) =
-ARGBToUVMatrixRow_C;
+      ARGBToUVMatrixRow_C;
 
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -660,34 +660,34 @@ ARGBToUVMatrixRow_C;
 #endif
 
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-      }
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-      }
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -780,7 +780,7 @@ ARGBToUVMatrixRow_C;
     MergeUVRow(row_u, row_v, dst_vu, halfwidth);
     ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants);
     ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width,
-                      argbconstants);
+                     argbconstants);
     src_argb += src_stride_argb * 2;
     dst_y += dst_stride_y * 2;
     dst_vu += dst_stride_uv;
@@ -864,7 +864,8 @@ int ARGBToYUY2Matrix(const uint8_t* src_argb,
   int y;
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
-                            const struct ArgbConstants* c) = ARGBToUVMatrixRow_C;
+                            const struct ArgbConstants* c) =
+      ARGBToUVMatrixRow_C;
   void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                            const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
   void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
@@ -976,7 +977,8 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
   int y;
   void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width,
-                            const struct ArgbConstants* c) = ARGBToUVMatrixRow_C;
+                            const struct ArgbConstants* c) =
+      ARGBToUVMatrixRow_C;
   void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                            const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
   void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
@@ -1077,8 +1079,6 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb,
   return 0;
 }
 
-
-
 // Same as NV12 but U and V swapped.
 LIBYUV_API
 int ARGBToNV21(const uint8_t* src_argb,
@@ -1089,8 +1089,9 @@ int ARGBToNV21(const uint8_t* src_argb,
                int dst_stride_vu,
                int width,
                int height) {
-  return ARGBToNV21Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_vu,
-                          dst_stride_vu, &kArgbI601Constants, width, height);
+  return ARGBToNV21Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                          dst_vu, dst_stride_vu, &kArgbI601Constants, width,
+                          height);
 }
 
 LIBYUV_API
@@ -1102,8 +1103,9 @@ int ABGRToNV12(const uint8_t* src_abgr,
                int dst_stride_uv,
                int width,
                int height) {
-  return ARGBToNV12Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_uv,
-                          dst_stride_uv, &kAbgrI601Constants, width, height);
+  return ARGBToNV12Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y,
+                          dst_uv, dst_stride_uv, &kAbgrI601Constants, width,
+                          height);
 }
 
 // Same as NV12 but U and V swapped.
@@ -1116,8 +1118,9 @@ int ABGRToNV21(const uint8_t* src_abgr,
                int dst_stride_vu,
                int width,
                int height) {
-  return ARGBToNV21Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_vu,
-                          dst_stride_vu, &kAbgrI601Constants, width, height);
+  return ARGBToNV21Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y,
+                          dst_vu, dst_stride_vu, &kAbgrI601Constants, width,
+                          height);
 }
 
 // Convert ARGB to YUY2.
@@ -1819,8 +1822,8 @@ int ARGBToJ444(const uint8_t* src_argb,
                int width,
                int height) {
   return ARGBToI444Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kArgbJPEGConstants, width, height);
 }
 
 // Convert ARGB to J420. (JPeg full range I420).
@@ -1836,8 +1839,8 @@ int ARGBToJ420(const uint8_t* src_argb,
                int width,
                int height) {
   return ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kArgbJPEGConstants, width, height);
 }
 
 // Convert ARGB to J422. (JPeg full range I422).
@@ -1853,8 +1856,8 @@ int ARGBToJ422(const uint8_t* src_argb,
                int width,
                int height) {
   return ARGBToI422Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kArgbJPEGConstants, width, height);
 }
 
 // Convert ARGB to J400.
@@ -1978,8 +1981,8 @@ int ABGRToJ420(const uint8_t* src_abgr,
                int width,
                int height) {
   return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrJPEGConstants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kAbgrJPEGConstants, width, height);
 }
 
 // Convert ABGR to J422. (JPeg full range I422).
@@ -1995,8 +1998,8 @@ int ABGRToJ422(const uint8_t* src_abgr,
                int width,
                int height) {
   return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u,
-                          dst_stride_u, dst_v, dst_stride_v, &kAbgrJPEGConstants,
-                          width, height);
+                          dst_stride_u, dst_v, dst_stride_v,
+                          &kAbgrJPEGConstants, width, height);
 }
 
 // Convert ABGR to J400.
@@ -2165,7 +2168,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
   void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
                            const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
   void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj,
-                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+                     uint8_t* dst_vu, int width) = MergeUVRow_C;
 #if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
@@ -2298,34 +2301,34 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
   }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
-    if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
-    if (TestCpuFlag(kCpuHasSVE2)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
-      }
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
-    if (TestCpuFlag(kCpuHasSME)) {
-      if (IS_ALIGNED(width, 2)) {
-        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
-      }
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
     }
+  }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -2424,7 +2427,8 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
       ARGBToUVMatrixRow(row, row_size, row_u, row_v, width, argbconstants);
       MergeUVRow(row_v, row_u, dst_vu, halfwidth);
       ARGBToYMatrixRow(row, dst_y, width, argbconstants);
-      ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants);
+      ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width,
+                       argbconstants);
       src_raw += src_stride_raw * 2;
       dst_y += dst_stride_y * 2;
       dst_vu += dst_stride_vu;
@@ -2482,7 +2486,6 @@ int RGB24ToNV12(const uint8_t* src_rgb24,
                          height);
 }
 
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index cff7c5d0a..3481d643d 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -8,13 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/planar_functions.h"
 
 #include <assert.h>
 #include <limits.h>
 #include <string.h>  // for memset()
 
+#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
 #include "libyuv/scale_row.h"  // for ScaleRowDown2
@@ -630,6 +630,14 @@ void SplitUVPlane(const uint8_t* src_uv,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    SplitUVRow = SplitUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      SplitUVRow = SplitUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_SPLITUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SplitUVRow = SplitUVRow_Any_NEON;
@@ -1087,7 +1095,7 @@ int NV21ToNV12(const uint8_t* src_y,
 }
 
 // Test if tile_height is a power of 2 (16 or 32)
-#define IS_POWEROFTWO(x) (!((x) & ((x)-1)))
+#define IS_POWEROFTWO(x) (!((x) & ((x) - 1)))
 
 // Detile a plane of data
 // tile width is 16 and assumed.
@@ -2588,6 +2596,14 @@ void MirrorPlane(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_MIRRORROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MirrorRow = MirrorRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MIRRORROW_LSX)
   if (TestCpuFlag(kCpuHasLSX)) {
     MirrorRow = MirrorRow_Any_LSX;
diff --git a/source/rotate.cc b/source/rotate.cc
index 54e0c2e63..60940f51f 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -8,11 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "libyuv/rotate.h"
+
 #include <assert.h>
 #include <limits.h>
 
-#include "libyuv/rotate.h"
-
 #include "libyuv/convert.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
@@ -403,6 +403,11 @@ void SplitRotateUV180(const uint8_t* src,
     MirrorSplitUVRow = MirrorSplitUVRow_AVX2;
   }
 #endif
+#if defined(HAS_MIRRORSPLITUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW) && IS_ALIGNED(width, 32)) {
+    MirrorSplitUVRow = MirrorSplitUVRow_AVX512BW;
+  }
+#endif
 #if defined(HAS_MIRRORSPLITUVROW_LSX)
   if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) {
     MirrorSplitUVRow = MirrorSplitUVRow_LSX;
diff --git a/source/rotate_win.cc b/source/rotate_win.cc
index 03eeee3a6..5b40f62a0 100644
--- a/source/rotate_win.cc
+++ b/source/rotate_win.cc
@@ -64,7 +64,7 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
     mov       eax, ebp
     movdqa    xmm7, xmm6
     palignr   xmm7, xmm7, 8
-    // Second round of bit swap.
+     // Second round of bit swap.
     punpcklwd xmm0, xmm2
     punpcklwd xmm1, xmm3
     movdqa    xmm2, xmm0
@@ -77,8 +77,8 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
     movdqa    xmm7, xmm5
     palignr   xmm6, xmm6, 8
     palignr   xmm7, xmm7, 8
-    // Third round of bit swap.
-    // Write to the destination pointer.
+     // Third round of bit swap.
+     // Write to the destination pointer.
     punpckldq xmm0, xmm4
     movq      qword ptr [edx], xmm0
     movdqa    xmm4, xmm0
@@ -173,7 +173,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
     movdqa    xmm7, xmm5
     lea       eax, [eax + 8 * edi + 16]
     neg       edi
-        // Second round of bit swap.
+         // Second round of bit swap.
     movdqa    xmm5, xmm0
     punpcklwd xmm0, xmm2
     punpckhwd xmm5, xmm2
@@ -193,8 +193,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
     punpckhwd xmm6, xmm7
     movdqa    xmm7, xmm6
 
-        // Third round of bit swap.
-        // Write to the destination pointer.
+         // Third round of bit swap.
+         // Write to the destination pointer.
     movdqa    xmm6, xmm0
     punpckldq xmm0, xmm4
     punpckhdq xmm6, xmm4
diff --git a/source/row_any.cc b/source/row_any.cc
index cac6339d1..340adc188 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1919,6 +1919,9 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
     memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
   }
 
+#ifdef HAS_MIRRORROW_AVX512BW
+ANY11M(MirrorRow_Any_AVX512BW, MirrorRow_AVX512BW, 1, 63)
+#endif
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
 #endif
@@ -2022,6 +2025,9 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3)
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
 #endif
+#ifdef HAS_SPLITUVROW_AVX512BW
+ANY12(SplitUVRow_Any_AVX512BW, SplitUVRow_AVX512BW, 0, 2, 0, 63)
+#endif
 #ifdef HAS_SPLITUVROW_AVX2
 ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
 #endif
@@ -2193,7 +2199,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
                uint8_t* dst_v, int width) {                                  \
     SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
     SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
-    memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    memset(vin, 0, sizeof(vin));   /* for msan */                            \
     memset(vout, 0, sizeof(vout)); /* for msan */                            \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -2215,29 +2221,29 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
     memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1));                         \
   }
 
-#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK)                                 \
-  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v,        \
-               int width, const struct ArgbConstants* c) {                   \
-    SIMD_ALIGNED(uint8_t vin[256]);                                          \
-    SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
-    memset(vin, 0, sizeof(vin)); /* for msan */                              \
-    int r = width & MASK;                                                    \
-    int n = width & ~MASK;                                                   \
-    if (n > 0) {                                                             \
-      ANY_SIMD(src_ptr, dst_u, dst_v, n, c);                                 \
-    }                                                                        \
-    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);           \
-    ANY_SIMD(vin, vout, vout + 256, MASK + 1, c);                            \
-    memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r);                        \
-    memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r);                  \
+#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK)                           \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
+               int width, const struct ArgbConstants* c) {             \
+    SIMD_ALIGNED(uint8_t vin[256]);                                    \
+    SIMD_ALIGNED(uint8_t vout[256 * 2]);                               \
+    memset(vin, 0, sizeof(vin)); /* for msan */                        \
+    int r = width & MASK;                                              \
+    int n = width & ~MASK;                                             \
+    if (n > 0) {                                                       \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n, c);                           \
+    }                                                                  \
+    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);     \
+    ANY_SIMD(vin, vout, vout + 256, MASK + 1, c);                      \
+    memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r);                  \
+    memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r);            \
   }
 
 #define ANY12MS(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                       \
-  void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u,        \
-               uint8_t* dst_v, int width, const struct ArgbConstants* c) {    \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u,       \
+               uint8_t* dst_v, int width, const struct ArgbConstants* c) {   \
     SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
     SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
-    memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    memset(vin, 0, sizeof(vin));   /* for msan */                            \
     memset(vout, 0, sizeof(vout)); /* for msan */                            \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -2291,6 +2297,9 @@ ANY12MS(ARGB4444ToUVMatrixRow_Any_AVX2, ARGB4444ToUVMatrixRow_AVX2, 0, 2, 31)
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW
 ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63)
 #endif
+#ifdef HAS_RGBTOUVMATRIXROW_AVX512BW
+ANY12MS(RGBToUVMatrixRow_Any_AVX512BW, RGBToUVMatrixRow_AVX512BW, 0, 3, 63)
+#endif
 #ifdef HAS_ARGBTOUVMATRIXROW_SSSE3
 ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7)
 #endif
@@ -2307,20 +2316,20 @@ ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15)
 ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
 #endif
 
-#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK)                                \
-  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width,          \
-               const struct ArgbConstants* c) {                              \
-    SIMD_ALIGNED(uint8_t vin[256]);                                          \
-    SIMD_ALIGNED(uint8_t vout[256]);                                         \
-    memset(vin, 0, sizeof(vin)); /* for msan */                              \
-    int r = width & MASK;                                                    \
-    int n = width & ~MASK;                                                   \
-    if (n > 0) {                                                             \
-      ANY_SIMD(src_ptr, dst_ptr, n, c);                                      \
-    }                                                                        \
-    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);           \
-    ANY_SIMD(vin, vout, MASK + 1, c);                                        \
-    memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r);                      \
+#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK)                       \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width, \
+               const struct ArgbConstants* c) {                     \
+    SIMD_ALIGNED(uint8_t vin[256]);                                 \
+    SIMD_ALIGNED(uint8_t vout[256]);                                \
+    memset(vin, 0, sizeof(vin)); /* for msan */                     \
+    int r = width & MASK;                                           \
+    int n = width & ~MASK;                                          \
+    if (n > 0) {                                                    \
+      ANY_SIMD(src_ptr, dst_ptr, n, c);                             \
+    }                                                               \
+    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);  \
+    ANY_SIMD(vin, vout, MASK + 1, c);                               \
+    memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r);             \
   }
 
 #ifdef HAS_ARGBTOYROW_SSSE3
diff --git a/source/row_common.cc b/source/row_common.cc
index a18c90d12..f44b0f313 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -14,7 +14,7 @@
 #include <string.h>  // For memcpy and memset.
 
 #include "libyuv/basic_types.h"
-#include "libyuv/convert_argb.h"  // For kYuvI601Constants
+#include "libyuv/convert_argb.h"       // For kYuvI601Constants
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
 
 #ifdef __cplusplus
@@ -764,7 +764,7 @@ static __inline uint8_t RGBToUMatrix(uint8_t b0,
                                      uint8_t b3,
                                      const struct ArgbConstants* c) {
   return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
-                         c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
+                          c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
          8;
 }
 static __inline uint8_t RGBToVMatrix(uint8_t b0,
@@ -773,7 +773,7 @@ static __inline uint8_t RGBToVMatrix(uint8_t b0,
                                      uint8_t b3,
                                      const struct ArgbConstants* c) {
   return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
-                         c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
+                          c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
          8;
 }
 
@@ -783,7 +783,8 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
                         const struct ArgbConstants* c) {
   int x;
   for (x = 0; x < width; ++x) {
-    dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
+    dst_y[0] =
+        RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
     src_argb += 4;
     dst_y += 1;
   }
@@ -1513,18 +1514,18 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
       YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
 
-#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV)   \
-  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =     \
-      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV),   \
-                        -(RV), 0, AY, AUV);                                    \
-  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =     \
-      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV),   \
-                        -(BV), 0, AY, AUV);                                    \
-  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =     \
-      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),       \
-                        -(GV), -(RV), AY, AUV);                                \
-  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =     \
-      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),       \
+#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \
+  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =   \
+      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \
+                        -(RV), 0, AY, AUV);                                  \
+  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =   \
+      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \
+                        -(BV), 0, AY, AUV);                                  \
+  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =   \
+      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),     \
+                        -(GV), -(RV), AY, AUV);                              \
+  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =   \
+      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),     \
                         -(GV), -(BV), AY, AUV);
 
 // BT.601 limited range RGB to YUV coefficients
@@ -3467,7 +3468,7 @@ void ARGBBlendRow_C(const uint8_t* src_argb,
 }
 #undef BLEND
 
-#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+#define UBLEND(f, b, a) ((((a) * (f)) + ((255 - (a)) * (b)) + 255) >> 8)
 void BlendPlaneRow_C(const uint8_t* src0,
                      const uint8_t* src1,
                      const uint8_t* alpha,
@@ -4618,8 +4619,7 @@ void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     RGB24ToARGBRow_AVX2(src_rgb, row, twidth);
-    RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb,
-                        row + MAXTWIDTH * 4, twidth);
+    RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth);
     ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_rgb += twidth * 3;
     dst_u += twidth / 2;
@@ -4629,6 +4629,29 @@ void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb,
 }
 #endif
 
+#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) && \
+    defined(HAS_RGB24TOARGBROW_AVX512BW)
+void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb,
+                               int src_stride_rgb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct ArgbConstants* c) {
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]);  // Scratch: 2 converted rows.
+  while (width > 0) {  // Handle at most MAXTWIDTH pixels per pass.
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    RGB24ToARGBRow_AVX512BW(src_rgb, row, twidth);  // First source row.
+    RGB24ToARGBRow_AVX512BW(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4,
+                            twidth);  // Row at src_rgb + src_stride_rgb.
+    ARGBToUVMatrixRow_AVX512BW(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
+    src_rgb += twidth * 3;  // 3 source bytes per pixel.
+    dst_u += twidth / 2;  // One U and one V byte per 2 pixels.
+    dst_v += twidth / 2;
+    width -= twidth;
+  }
+}
+#endif  // HAS_ARGBTOUVMATRIXROW_AVX512BW && HAS_RGB24TOARGBROW_AVX512BW
+
 #if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_RGB24TOARGBROW_NEON)
 void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb,
                            int src_stride_rgb,
@@ -4675,7 +4698,8 @@ void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     RGB565ToARGBRow_C(src_rgb565, row, twidth);
-    RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth);
+    RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
+                      twidth);
     ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_rgb565 += twidth * 2;
     dst_u += twidth / 2;
@@ -4712,8 +4736,8 @@ void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     RGB565ToARGBRow_AVX2(src_rgb565, row, twidth);
-    RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565,
-                         row + MAXTWIDTH * 4, twidth);
+    RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
+                         twidth);
     ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_rgb565 += twidth * 2;
     dst_u += twidth / 2;
@@ -4751,7 +4775,8 @@ void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     RGB565ToARGBRow_NEON(src_rgb565, row, twidth);
-    RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth);
+    RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4,
+                         twidth);
     ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_rgb565 += twidth * 2;
     dst_u += twidth / 2;
@@ -4786,7 +4811,8 @@ void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     ARGB1555ToARGBRow_C(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth);
+    ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4,
+                        twidth);
     ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_argb1555 += twidth * 2;
     dst_u += twidth / 2;
@@ -4820,7 +4846,8 @@ void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     ARGB4444ToARGBRow_C(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth);
+    ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4,
+                        twidth);
     ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_argb4444 += twidth * 2;
     dst_u += twidth / 2;
@@ -4956,7 +4983,8 @@ void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth);
-    ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth);
+    ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555,
+                           row + MAXTWIDTH * 4, twidth);
     ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_argb1555 += twidth * 2;
     dst_u += twidth / 2;
@@ -4977,7 +5005,8 @@ void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444,
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth);
-    ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth);
+    ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444,
+                           row + MAXTWIDTH * 4, twidth);
     ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c);
     src_argb4444 += twidth * 2;
     dst_u += twidth / 2;
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index e37e58b01..10ecf5910 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -120,11 +120,11 @@ static const lvec8 kShuffleNV21 = {
 
 #if defined(HAS_J400TOARGBROW_AVX2) || defined(HAS_J400TOARGBROW_AVX512BW)
 alignas(64) static const uint8_t kShuffleMaskJ400ToARGB[64] = {
-    0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
-    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u,
-    8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
-    12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
-};
+    0u,  0u,   0u,  128u, 1u,  1u,   1u,  128u, 2u,  2u,   2u,  128u, 3u,  3u,
+    3u,  128u, 4u,  4u,   4u,  128u, 5u,  5u,   5u,  128u, 6u,  6u,   6u,  128u,
+    7u,  7u,   7u,  128u, 8u,  8u,   8u,  128u, 9u,  9u,   9u,  128u, 10u, 10u,
+    10u, 128u, 11u, 11u,  11u, 128u, 12u, 12u,  12u, 128u, 13u, 13u,  13u, 128u,
+    14u, 14u,  14u, 128u, 15u, 15u,  15u, 128u};
 #endif
 
 #ifdef HAS_J400TOARGBROW_AVX2
@@ -149,16 +149,18 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
       "sub         $0x10,%2                      \n"
       "jg          1b                            \n"
       "vzeroupper  \n"
-      : "+r"(src_y),     // %0
-        "+r"(dst_argb),  // %1
-        "+r"(width)      // %2
-      : "r"(kShuffleMaskJ400ToARGB) // %3
+      : "+r"(src_y),                 // %0
+        "+r"(dst_argb),              // %1
+        "+r"(width)                  // %2
+      : "r"(kShuffleMaskJ400ToARGB)  // %3
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_J400TOARGBROW_AVX2
 
 #ifdef HAS_J400TOARGBROW_AVX512BW
-void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void J400ToARGBRow_AVX512BW(const uint8_t* src_y,
+                            uint8_t* dst_argb,
+                            int width) {
   asm volatile(
       "vpternlogd  $0xff,%%zmm7,%%zmm7,%%zmm7    \n"  // 0xffffffff
       "vpslld      $0x18,%%zmm7,%%zmm7           \n"  // 0xff000000
@@ -179,10 +181,10 @@ void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width)
       "sub         $0x20,%2                      \n"
       "jg          1b                            \n"
       "vzeroupper  \n"
-      : "+r"(src_y),     // %0
-        "+r"(dst_argb),  // %1
-        "+r"(width)      // %2
-      : "m"(kShuffleMaskJ400ToARGB) // %3
+      : "+r"(src_y),                 // %0
+        "+r"(dst_argb),              // %1
+        "+r"(width)                  // %2
+      : "m"(kShuffleMaskJ400ToARGB)  // %3
       : "memory", "cc", "xmm0", "xmm1", "xmm5", "xmm7");
 }
 #endif  // HAS_J400TOARGBROW_AVX512BW
@@ -221,15 +223,17 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
       "lea         0x40(%1),%1                   \n"
       "sub         $0x10,%2                      \n"
       "jg          1b                            \n"
-      : "+r"(src_rgb24),              // %0
-        "+r"(dst_argb),               // %1
-        "+r"(width)                   // %2
+      : "+r"(src_rgb24),                 // %0
+        "+r"(dst_argb),                  // %1
+        "+r"(width)                      // %2
       : "m"(kShuffleMaskRGB24ToARGB[0])  // %3
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
 #ifdef HAS_RGB24TOARGBROW_AVX2
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
   // Reference to prevent discarding of kShuffleMaskRGB24ToARGB[1] which is
   // accessed via offset in assembly.
   const uvec8* dummy = &kShuffleMaskRGB24ToARGB[1];
@@ -267,9 +271,9 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width)
       "sub         $0x20,%2                      \n"
       "jg          1b                            \n"
       "vzeroupper  \n"
-      : "+r"(src_rgb24),              // %0
-        "+r"(dst_argb),               // %1
-        "+r"(width)                   // %2
+      : "+r"(src_rgb24),                 // %0
+        "+r"(dst_argb),                  // %1
+        "+r"(width)                      // %2
       : "m"(kShuffleMaskRGB24ToARGB[0])  // %3
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
@@ -358,7 +362,10 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 static const uint32_t kPermdRAWToARGB_AVX512BW[16] = {
     0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
 
-void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) {
+void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
+                           uint8_t* dst_argb,
+                           const uint32_t* shuffler,
+                           int width) {
   asm volatile(
       "vpternlogd  $0xff,%%zmm6,%%zmm6,%%zmm6    \n"  // 0xffffffff
       "vpslld      $0x18,%%zmm6,%%zmm6           \n"  // 0xff000000
@@ -399,14 +406,20 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint
         "+r"(width)                     // %2
       : "m"(kPermdRAWToARGB_AVX512BW),  // %3
         "m"(*shuffler)                  // %4
-      : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+        "xmm5", "xmm6");
 }
 
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
-  RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width);
+void RAWToARGBRow_AVX512BW(const uint8_t* src_raw,
+                           uint8_t* dst_argb,
+                           int width) {
+  RGBToARGBRow_AVX512BW(src_raw, dst_argb,
+                        (const uint32_t*)&kShuffleMaskRAWToARGB, width);
 }
 
-void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24,
+                             uint8_t* dst_argb,
+                             int width) {
   RGBToARGBRow_AVX512BW(src_rgb24, dst_argb,
                         (const uint32_t*)&kShuffleMaskRGB24ToARGB[0], width);
 }
@@ -622,35 +635,35 @@ void ARGB4444ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 #endif
 
 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-      asm volatile("movdqa      %3,%%xmm6                     \n"
+  asm volatile("movdqa      %3,%%xmm6                     \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqu      0x10(%0),%%xmm1               \n"
-      "movdqu      0x20(%0),%%xmm2               \n"
-      "movdqu      0x30(%0),%%xmm3               \n"
-      "lea         0x40(%0),%0                   \n"
-      "pshufb      %%xmm6,%%xmm0                 \n"
-      "pshufb      %%xmm6,%%xmm1                 \n"
-      "pshufb      %%xmm6,%%xmm2                 \n"
-      "pshufb      %%xmm6,%%xmm3                 \n"
-      "movdqa      %%xmm1,%%xmm4                 \n"
-      "psrldq      $0x4,%%xmm1                   \n"
-      "pslldq      $0xc,%%xmm4                   \n"
-      "movdqa      %%xmm2,%%xmm5                 \n"
-      "por         %%xmm4,%%xmm0                 \n"
-      "pslldq      $0x8,%%xmm5                   \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "por         %%xmm5,%%xmm1                 \n"
-      "psrldq      $0x8,%%xmm2                   \n"
-      "pslldq      $0x4,%%xmm3                   \n"
-      "por         %%xmm3,%%xmm2                 \n"
-      "movdqu      %%xmm1,0x10(%1)               \n"
-      "movdqu      %%xmm2,0x20(%1)               \n"
-      "lea         0x30(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqu      0x10(%0),%%xmm1               \n"
+               "movdqu      0x20(%0),%%xmm2               \n"
+               "movdqu      0x30(%0),%%xmm3               \n"
+               "lea         0x40(%0),%0                   \n"
+               "pshufb      %%xmm6,%%xmm0                 \n"
+               "pshufb      %%xmm6,%%xmm1                 \n"
+               "pshufb      %%xmm6,%%xmm2                 \n"
+               "pshufb      %%xmm6,%%xmm3                 \n"
+               "movdqa      %%xmm1,%%xmm4                 \n"
+               "psrldq      $0x4,%%xmm1                   \n"
+               "pslldq      $0xc,%%xmm4                   \n"
+               "movdqa      %%xmm2,%%xmm5                 \n"
+               "por         %%xmm4,%%xmm0                 \n"
+               "pslldq      $0x8,%%xmm5                   \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "por         %%xmm5,%%xmm1                 \n"
+               "psrldq      $0x8,%%xmm2                   \n"
+               "pslldq      $0x4,%%xmm3                   \n"
+               "por         %%xmm3,%%xmm2                 \n"
+               "movdqu      %%xmm1,0x10(%1)               \n"
+               "movdqu      %%xmm2,0x20(%1)               \n"
+               "lea         0x30(%1),%1                   \n"
+               "sub         $0x10,%2                      \n"
+               "jg          1b                            \n"
                : "+r"(src),                    // %0
                  "+r"(dst),                    // %1
                  "+r"(width)                   // %2
@@ -660,35 +673,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-      asm volatile("movdqa      %3,%%xmm6                     \n"
+  asm volatile("movdqa      %3,%%xmm6                     \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqu      0x10(%0),%%xmm1               \n"
-      "movdqu      0x20(%0),%%xmm2               \n"
-      "movdqu      0x30(%0),%%xmm3               \n"
-      "lea         0x40(%0),%0                   \n"
-      "pshufb      %%xmm6,%%xmm0                 \n"
-      "pshufb      %%xmm6,%%xmm1                 \n"
-      "pshufb      %%xmm6,%%xmm2                 \n"
-      "pshufb      %%xmm6,%%xmm3                 \n"
-      "movdqa      %%xmm1,%%xmm4                 \n"
-      "psrldq      $0x4,%%xmm1                   \n"
-      "pslldq      $0xc,%%xmm4                   \n"
-      "movdqa      %%xmm2,%%xmm5                 \n"
-      "por         %%xmm4,%%xmm0                 \n"
-      "pslldq      $0x8,%%xmm5                   \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "por         %%xmm5,%%xmm1                 \n"
-      "psrldq      $0x8,%%xmm2                   \n"
-      "pslldq      $0x4,%%xmm3                   \n"
-      "por         %%xmm3,%%xmm2                 \n"
-      "movdqu      %%xmm1,0x10(%1)               \n"
-      "movdqu      %%xmm2,0x20(%1)               \n"
-      "lea         0x30(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqu      0x10(%0),%%xmm1               \n"
+               "movdqu      0x20(%0),%%xmm2               \n"
+               "movdqu      0x30(%0),%%xmm3               \n"
+               "lea         0x40(%0),%0                   \n"
+               "pshufb      %%xmm6,%%xmm0                 \n"
+               "pshufb      %%xmm6,%%xmm1                 \n"
+               "pshufb      %%xmm6,%%xmm2                 \n"
+               "pshufb      %%xmm6,%%xmm3                 \n"
+               "movdqa      %%xmm1,%%xmm4                 \n"
+               "psrldq      $0x4,%%xmm1                   \n"
+               "pslldq      $0xc,%%xmm4                   \n"
+               "movdqa      %%xmm2,%%xmm5                 \n"
+               "por         %%xmm4,%%xmm0                 \n"
+               "pslldq      $0x8,%%xmm5                   \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "por         %%xmm5,%%xmm1                 \n"
+               "psrldq      $0x8,%%xmm2                   \n"
+               "pslldq      $0x4,%%xmm3                   \n"
+               "por         %%xmm3,%%xmm2                 \n"
+               "movdqu      %%xmm1,0x10(%1)               \n"
+               "movdqu      %%xmm2,0x20(%1)               \n"
+               "lea         0x30(%1),%1                   \n"
+               "sub         $0x10,%2                      \n"
+               "jg          1b                            \n"
                : "+r"(src),                  // %0
                  "+r"(dst),                  // %1
                  "+r"(width)                 // %2
@@ -1153,21 +1166,21 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
 void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                          uint8_t* dst_argb,
                          int width) {
-      asm volatile("movdqa      %3,%%xmm2                     \n"
+  asm volatile("movdqa      %3,%%xmm2                     \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqu      0x10(%0),%%xmm1               \n"
-      "psrlw       $8,%%xmm0                     \n"
-      "psrlw       $8,%%xmm1                     \n"
-      "packuswb    %%xmm1,%%xmm0                 \n"
-      "pshufb      %%xmm2,%%xmm0                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "lea         0x20(%0),%0                   \n"
-      "lea         0x10(%1),%1                   \n"
-      "sub         $0x4,%2                       \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqu      0x10(%0),%%xmm1               \n"
+               "psrlw       $8,%%xmm0                     \n"
+               "psrlw       $8,%%xmm1                     \n"
+               "packuswb    %%xmm1,%%xmm0                 \n"
+               "pshufb      %%xmm2,%%xmm0                 \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "lea         0x20(%0),%0                   \n"
+               "lea         0x10(%1),%1                   \n"
+               "sub         $0x4,%2                       \n"
+               "jg          1b                            \n"
                : "+r"(src_ab64),          // %0
                  "+r"(dst_argb),          // %1
                  "+r"(width)              // %2
@@ -1258,21 +1271,21 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
 void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
-      asm volatile("vbroadcasti128 %3,%%ymm2                  \n" LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "vmovdqu     0x20(%0),%%ymm1               \n"
-      "vpsrlw      $8,%%ymm0,%%ymm0              \n"
-      "vpsrlw      $8,%%ymm1,%%ymm1              \n"
-      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
-      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-      "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "lea         0x40(%0),%0                   \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x8,%2                       \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+  asm volatile("vbroadcasti128 %3,%%ymm2                  \n" LABELALIGN
+               "1:          \n"
+               "vmovdqu     (%0),%%ymm0                   \n"
+               "vmovdqu     0x20(%0),%%ymm1               \n"
+               "vpsrlw      $8,%%ymm0,%%ymm0              \n"
+               "vpsrlw      $8,%%ymm1,%%ymm1              \n"
+               "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+               "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+               "vpshufb     %%ymm2,%%ymm0,%%ymm0          \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "lea         0x40(%0),%0                   \n"
+               "lea         0x20(%1),%1                   \n"
+               "sub         $0x8,%2                       \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_ab64),          // %0
                  "+r"(dst_argb),          // %1
                  "+r"(width)              // %2
@@ -1452,9 +1465,7 @@ void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
       "movdqa      %%xmm4,%%xmm6                 \n"
       "pmaddubsw   %%xmm5,%%xmm6                 \n"
       "phaddw      %%xmm6,%%xmm6                 \n"
-      "psubw       %%xmm6,%%xmm7                 \n"
-      LABELALIGN ""
-      RGBTOY(xmm7)
+      "psubw       %%xmm6,%%xmm7                 \n" LABELALIGN "" RGBTOY(xmm7)
       : "+r"(src_argb),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
@@ -1478,10 +1489,8 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
       "vpmaddubsw  %%ymm5,%%ymm4,%%ymm6          \n"
       "vphaddw     %%ymm6,%%ymm6,%%ymm6          \n"
       "vpsubw      %%ymm6,%%ymm7,%%ymm7          \n"
-      "vmovdqa     %4,%%ymm6                     \n"
-      LABELALIGN ""
-      RGBTOY_AVX2(ymm7)
-      "vzeroupper  \n"
+      "vmovdqa     %4,%%ymm6                     \n" LABELALIGN
+      "" RGBTOY_AVX2(ymm7) "vzeroupper  \n"
       : "+r"(src_argb),         // %0
         "+r"(dst_y),            // %1
         "+r"(width)             // %2
@@ -1492,8 +1501,9 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
 }
 #endif
 
-#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW)
-static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
+#if defined(HAS_ARGBTOYROW_AVX512BW) || \
+    defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW)
+static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8,  12, 1, 5, 9,  13,
                                                     2, 6, 10, 14, 3, 7, 11, 15};
 #endif
 
@@ -1511,15 +1521,14 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
       "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
       "vpsllw      $15,%%zmm16,%%zmm5            \n"
       "vpacksswb   %%zmm5,%%zmm5,%%zmm5          \n"
-      "vpsrlw      $15,%%zmm16,%%zmm16           \n" // zmm16 = 1
+      "vpsrlw      $15,%%zmm16,%%zmm16           \n"  // zmm16 = 1
       "vbroadcasti64x4 0(%3),%%zmm4              \n"
       "vbroadcasti64x4 0x60(%3),%%zmm7           \n"
       "vpmaddubsw  %%zmm5,%%zmm4,%%zmm6          \n"
       "vpmaddwd    %%zmm16,%%zmm6,%%zmm6         \n"
       "vpackssdw   %%zmm6,%%zmm6,%%zmm6          \n"
       "vpsubw      %%zmm6,%%zmm7,%%zmm7          \n"
-      "vmovups     %4,%%zmm6                     \n"
-      LABELALIGN
+      "vmovups     %4,%%zmm6                     \n" LABELALIGN
       "1:          \n"
       "vmovups     (%0),%%zmm0                   \n"
       "vmovups     0x40(%0),%%zmm1               \n"
@@ -1551,11 +1560,11 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
       "sub         $0x40,%2                      \n"
       "jg          1b                            \n"
       "vzeroupper  \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_y),     // %1
-        "+r"(width)      // %2
-      : "r"(c),          // %3
-        "m"(kPermdARGBToY_AVX512BW) // %4
+      : "+r"(src_argb),              // %0
+        "+r"(dst_y),                 // %1
+        "+r"(width)                  // %2
+      : "r"(c),                      // %3
+        "m"(kPermdARGBToY_AVX512BW)  // %4
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
         "xmm7", "xmm16");
 }
@@ -1713,8 +1722,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
   asm volatile(
       "vbroadcasti64x4 0x20(%4),%%zmm3               \n"  // kRGBToU
       "vbroadcasti64x4 0x40(%4),%%zmm4               \n"  // kRGBToV
-      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"  // -1
-      "vpsllw      $15,%%zmm16,%%zmm5            \n"  // 0x8000
+      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"      // -1
+      "vpsllw      $15,%%zmm16,%%zmm5            \n"      // 0x8000
       "vmovups     %5,%%zmm7                     \n"
       "sub         %1,%2                         \n"
 
@@ -1874,8 +1883,8 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                             int width,
                             const struct ArgbConstants* c) {
   asm volatile(
-      "vbroadcasti128 0x20(%5),%%ymm4           \n"  // RGBToU
-      "vbroadcasti128 0x40(%5),%%ymm5           \n"  // RGBToV
+      "vbroadcasti128 0x20(%5),%%ymm4           \n"   // RGBToU
+      "vbroadcasti128 0x40(%5),%%ymm5           \n"   // RGBToV
       "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6          \n"  // 0x0101
       "vpabsb      %%ymm6,%%ymm6                 \n"
       "vmovdqa     %6,%%ymm7                     \n"  // kShuffleAARRGGBB
@@ -2174,8 +2183,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
       "vbroadcasti64x4 0x20(%5),%%zmm4               \n"  // RGBToU
       "vbroadcasti64x4 0x40(%5),%%zmm5               \n"  // RGBToV
       "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
-      "vpabsb      %%zmm16,%%zmm6                \n"  // 0x0101
-      "vpsllw      $15,%%zmm16,%%zmm17           \n"  // 0x8000
+      "vpabsb      %%zmm16,%%zmm6                \n"      // 0x0101
+      "vpsllw      $15,%%zmm16,%%zmm17           \n"      // 0x8000
       "vbroadcasti64x4 %6,%%zmm7                     \n"  // kShuffleAARRGGBB
       "vmovups     %7,%%zmm18                    \n"  // kPermdARGBToY_AVX512BW
       "vmovups     %8,%%zmm19                    \n"  // kPermdARGBToUV_AVX512BW
@@ -2209,7 +2218,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
       "vpmaddubsw  %%zmm5,%%zmm0,%%zmm0          \n"  // 16 V
       "vpmaddwd    %%zmm16,%%zmm1,%%zmm1         \n"
       "vpmaddwd    %%zmm16,%%zmm0,%%zmm0         \n"
-      "vpackssdw   %%zmm0,%%zmm1,%%zmm0          \n"  // mutates (U in lower, V in upper)
+      "vpackssdw   %%zmm0,%%zmm1,%%zmm0          \n"  // mutates (U in lower, V
+                                                      // in upper)
       "vpaddw      %%zmm17,%%zmm0,%%zmm0         \n"
       "vpsrlw      $0x8,%%zmm0,%%zmm0            \n"
       "vpackuswb   %%zmm0,%%zmm0,%%zmm0          \n"  // mutates
@@ -2659,12 +2669,12 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA444 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READYUVA444 YUVTORGB(yuvconstants)
                    STOREARGB
-      "subl        $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "subl        $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -2985,12 +2995,12 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA210 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READYUVA210 YUVTORGB(yuvconstants)
                    STOREARGB
-      "subl        $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "subl        $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),  // %[y_buf]
                  [u_buf] "+r"(u_buf),  // %[u_buf]
                  [v_buf] "+r"(v_buf),  // %[v_buf]
@@ -3017,12 +3027,12 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA410 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READYUVA410 YUVTORGB(yuvconstants)
                    STOREARGB
-      "subl        $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "subl        $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),  // %[y_buf]
                  [u_buf] "+r"(u_buf),  // %[u_buf]
                  [v_buf] "+r"(v_buf),  // %[v_buf]
@@ -3083,12 +3093,12 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA422 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READYUVA422 YUVTORGB(yuvconstants)
                    STOREARGB
-      "subl        $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "subl        $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -3111,12 +3121,12 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-      LABELALIGN "1:          \n" READNV12 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READNV12 YUVTORGB(yuvconstants)
                    STOREARGB
-      "sub         $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "sub         $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -3132,12 +3142,12 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-      LABELALIGN "1:          \n" READNV21 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READNV21 YUVTORGB(yuvconstants)
                    STOREARGB
-      "sub         $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "sub         $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),               // %[y_buf]
                  [vu_buf] "+r"(vu_buf),             // %[vu_buf]
                  [dst_argb] "+r"(dst_argb),         // %[dst_argb]
@@ -3155,7 +3165,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
   asm volatile(
       "movdqa      %[kShuffleYUY2Y],%%xmm6       \n"
       "movdqa      %[kShuffleYUY2UV],%%xmm7      \n" YUVTORGB_SETUP(
-      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+          yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
       LABELALIGN "1:          \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB
       "sub         $0x8,%[width]                 \n"
@@ -3176,7 +3186,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
   asm volatile(
       "movdqa      %[kShuffleUYVYY],%%xmm6       \n"
       "movdqa      %[kShuffleUYVYUV],%%xmm7      \n" YUVTORGB_SETUP(
-      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+          yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
       LABELALIGN "1:          \n" READUYVY YUVTORGB(yuvconstants) STOREARGB
       "sub         $0x8,%[width]                 \n"
@@ -3196,12 +3206,12 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-      LABELALIGN "1:          \n" READP210 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READP210 YUVTORGB(yuvconstants)
                    STOREARGB
-      "sub         $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "sub         $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[u_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -3217,12 +3227,12 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile(YUVTORGB_SETUP(
-      yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
+                   yuvconstants) "pcmpeqb     %%xmm5,%%xmm5                 \n"
 
-      LABELALIGN "1:          \n" READP410 YUVTORGB(yuvconstants)
+               LABELALIGN "1:          \n" READP410 YUVTORGB(yuvconstants)
                    STOREARGB
-      "sub         $0x8,%[width]                 \n"
-      "jg          1b                            \n"
+               "sub         $0x8,%[width]                 \n"
+               "jg          1b                            \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[u_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4045,13 +4055,13 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA210_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READYUVA210_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "subl        $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "subl        $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
 
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
@@ -4080,13 +4090,13 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA410_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READYUVA410_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "subl        $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "subl        $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
 
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
@@ -4155,13 +4165,13 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA444_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READYUVA444_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "subl        $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "subl        $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -4189,13 +4199,13 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
+                   yuvconstants) "sub         %[u_buf],%[v_buf]             \n"
 
-      LABELALIGN "1:          \n" READYUVA422_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READYUVA422_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "subl        $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "subl        $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),        // %[y_buf]
                  [u_buf] "+r"(u_buf),        // %[u_buf]
                  [v_buf] "+r"(v_buf),        // %[v_buf]
@@ -4265,13 +4275,13 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-      LABELALIGN "1:          \n" READNV12_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READNV12_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "sub         $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "sub         $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4291,13 +4301,13 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-      LABELALIGN "1:          \n" READNV21_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READNV21_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "sub         $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "sub         $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),               // %[y_buf]
                  [vu_buf] "+r"(vu_buf),             // %[vu_buf]
                  [dst_argb] "+r"(dst_argb),         // %[dst_argb]
@@ -4319,7 +4329,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
   asm volatile(
       "vbroadcasti128 %[kShuffleYUY2Y],%%ymm6    \n"
       "vbroadcasti128 %[kShuffleYUY2UV],%%ymm7   \n" YUVTORGB_SETUP_AVX2(
-      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+          yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
       LABELALIGN "1:          \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants)
           STOREARGB_AVX2
@@ -4346,7 +4356,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
   asm volatile(
       "vbroadcasti128 %[kShuffleUYVYY],%%ymm6    \n"
       "vbroadcasti128 %[kShuffleUYVYUV],%%ymm7   \n" YUVTORGB_SETUP_AVX2(
-      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+          yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
       LABELALIGN "1:          \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants)
           STOREARGB_AVX2
@@ -4372,13 +4382,13 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-      LABELALIGN "1:          \n" READP210_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READP210_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "sub         $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "sub         $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4398,13 +4408,13 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile(YUVTORGB_SETUP_AVX2(
-      yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
+                   yuvconstants) "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
 
-      LABELALIGN "1:          \n" READP410_AVX2 YUVTORGB_AVX2(
+               LABELALIGN "1:          \n" READP410_AVX2 YUVTORGB_AVX2(
                    yuvconstants) STOREARGB_AVX2
-      "sub         $0x10,%[width]                \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "sub         $0x10,%[width]                \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : [y_buf] "+r"(y_buf),              // %[y_buf]
                  [uv_buf] "+r"(uv_buf),            // %[uv_buf]
                  [dst_argb] "+r"(dst_argb),        // %[dst_argb]
@@ -4583,16 +4593,16 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
 
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-      asm volatile("movdqa      %3,%%xmm5                     \n"
+  asm volatile("movdqa      %3,%%xmm5                     \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
-      "pshufb      %%xmm5,%%xmm0                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "lea         0x10(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      -0x10(%0,%2,1),%%xmm0         \n"
+               "pshufb      %%xmm5,%%xmm0                 \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "lea         0x10(%1),%1                   \n"
+               "sub         $0x10,%2                      \n"
+               "jg          1b                            \n"
                : "+r"(src),           // %0
                  "+r"(dst),           // %1
                  "+r"(temp_width)     // %2
@@ -4601,21 +4611,44 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
+#ifdef HAS_MIRRORROW_AVX512BW
+void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {  // Byte-reverses a row: reads src from its end, writes dst forward, 64 bytes per pass.
+  ptrdiff_t temp_width = (ptrdiff_t)(width);  // Pointer-sized copy so %2 can serve as an index register.
+  asm volatile("vbroadcasti32x4 %3,%%zmm5                 \n"  // zmm5 = kShuffleMirror in every 128-bit lane
+
+               LABELALIGN
+               "1:          \n"
+               "vmovdqu8    -0x40(%0,%2,1),%%zmm0         \n"  // load the last 64 unprocessed bytes
+               "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"  // shuffle bytes within each 128-bit lane
+               "vshufi64x2  $0x1b,%%zmm0,%%zmm0,%%zmm0    \n"  // reverse the order of the four lanes
+               "vmovdqu8    %%zmm0,(%1)                   \n"  // store 64 bytes at dst
+               "lea         0x40(%1),%1                   \n"  // dst += 64
+               "sub         $0x40,%2                      \n"  // 64 bytes consumed
+               "jg          1b                            \n"  // NOTE(review): runs at least once; presumably width is a positive multiple of 64 - confirm at callers
+               "vzeroupper  \n"
+               : "+r"(src),           // %0
+                 "+r"(dst),           // %1
+                 "+r"(temp_width)     // %2
+               : "m"(kShuffleMirror)  // %3
+               : "memory", "cc", "zmm0", "zmm5");
+}
+#endif  // HAS_MIRRORROW_AVX512BW
+
 #ifdef HAS_MIRRORROW_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-      asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
+  asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
 
                LABELALIGN
-      "1:          \n"
-      "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
-      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovdqu     -0x20(%0,%2,1),%%ymm0         \n"
+               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+               "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "lea         0x20(%1),%1                   \n"
+               "sub         $0x20,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src),           // %0
                  "+r"(dst),           // %1
                  "+r"(temp_width)     // %2
@@ -4624,11 +4657,50 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2
 
-#ifdef HAS_MIRRORSPLITUVROW_AVX2
+#if defined(HAS_MIRRORSPLITUVROW_AVX2) || defined(HAS_MIRRORSPLITUVROW_AVX512BW)
 // Shuffle table for reversing the bytes of UV channels.
 static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                             15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+#endif
 
+#ifdef HAS_MIRRORSPLITUVROW_AVX512BW
+static const uint64_t kMirrorSplitUVPermute[8] = {6, 4, 2, 0, 7, 5, 3, 1};  // vpermq indices: low qwords of lanes 3..0, then high qwords of lanes 3..0
+
+void MirrorSplitUVRow_AVX512BW(const uint8_t* src,  // interleaved byte pairs, consumed back to front
+                               uint8_t* dst_u,  // receives the even source bytes, mirrored
+                               uint8_t* dst_v,  // receives the odd source bytes, mirrored
+                               int width) {  // counted in byte pairs; 32 pairs per loop pass
+  ptrdiff_t temp_width = (ptrdiff_t)(width);  // Pointer-sized copy so %3 can serve as an index register.
+  asm volatile(
+      "vbroadcasti32x4 %4,%%zmm1                 \n"  // zmm1 = per-lane shuffle
+      "lea         -0x40(%0,%3,2),%0             \n"  // src += 2*width - 64
+      "sub         %1,%2                         \n"  // %2 = dst_v - dst_u
+      "vmovdqu64   %5,%%zmm3                     \n"  // zmm3 = qword permute
+
+      LABELALIGN
+      "1:          \n"
+      "vmovdqu8    (%0),%%zmm0                   \n"  // load 32 pairs
+      "lea         -0x40(%0),%0                  \n"  // src -= 64
+      "vpshufb     %%zmm1,%%zmm0,%%zmm0          \n"  // lane: U rev | V rev
+      "vpermq      %%zmm0,%%zmm3,%%zmm0          \n"  // low 256 = U, high = V
+      "vextracti64x4 $0x1,%%zmm0,%%ymm2          \n"  // ymm2 = V half
+      "vmovdqu     %%ymm0,(%1)                   \n"  // store 32 U bytes
+      "vmovdqu     %%ymm2,0x00(%1,%2,1)          \n"  // store 32 V bytes
+      "lea         0x20(%1),%1                   \n"  // dst_u += 32
+      "sub         $0x20,%3                      \n"  // 32 pairs done
+      "jg          1b                            \n"  // NOTE(review): presumably width % 32 == 0 - confirm
+      "vzeroupper  \n"
+      : "+r"(src),                   // %0
+        "+r"(dst_u),                 // %1
+        "+r"(dst_v),                 // %2
+        "+r"(temp_width)             // %3
+      : "m"(kShuffleMirrorSplitUV),  // %4
+        "m"(kMirrorSplitUVPermute)   // %5
+      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3");
+}
+#endif  // HAS_MIRRORSPLITUVROW_AVX512BW
+
+#ifdef HAS_MIRRORSPLITUVROW_AVX2
 void MirrorSplitUVRow_AVX2(const uint8_t* src,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
@@ -4668,16 +4740,16 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
 
 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-      asm volatile("movdqa      %3,%%xmm5                     \n"
+  asm volatile("movdqa      %3,%%xmm5                     \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
-      "pshufb      %%xmm5,%%xmm0                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "lea         0x10(%1),%1                   \n"
-      "sub         $0x8,%2                       \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      -0x10(%0,%2,2),%%xmm0         \n"
+               "pshufb      %%xmm5,%%xmm0                 \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "lea         0x10(%1),%1                   \n"
+               "sub         $0x8,%2                       \n"
+               "jg          1b                            \n"
                : "+r"(src_uv),          // %0
                  "+r"(dst_uv),          // %1
                  "+r"(temp_width)       // %2
@@ -4689,18 +4761,18 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
 #ifdef HAS_MIRRORUVROW_AVX2
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-      asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
+  asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
 
                LABELALIGN
-      "1:          \n"
-      "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
-      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-      "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovdqu     -0x20(%0,%2,2),%%ymm0         \n"
+               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+               "vpermq      $0x4e,%%ymm0,%%ymm0           \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "lea         0x20(%1),%1                   \n"
+               "sub         $0x10,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_uv),          // %0
                  "+r"(dst_uv),          // %1
                  "+r"(temp_width)       // %2
@@ -4759,13 +4831,11 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
 #ifdef HAS_RGB24MIRRORROW_AVX2
 // Shuffle first 10 pixels to last 10 mirrored.  first byte zero
 static const uvec8 kShuffleMirrorRGB0_AVX = {
-    128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u
-};
+    128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
 
 // Shuffle last 2 pixels to first 2 mirrored.  last byte zero
 static const uvec8 kShuffleMirrorRGB1_AVX = {
-    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u
-};
+    13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
 
 void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
@@ -4801,9 +4871,9 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
       "sub         $0x20,%2                      \n"
       "jg          1b                            \n"
       "vzeroupper  \n"
-      : "+r"(src_rgb24),          // %0
-        "+r"(dst_rgb24),          // %1
-        "+r"(temp_width)          // %2
+      : "+r"(src_rgb24),              // %0
+        "+r"(dst_rgb24),              // %1
+        "+r"(temp_width)              // %2
       : "m"(kShuffleMirrorRGB0_AVX),  // %3
         "m"(kShuffleMirrorRGB1_AVX)   // %4
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
@@ -4814,17 +4884,17 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
 
 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-      asm volatile("lea         -0x10(%0,%2,4),%0             \n"
+  asm volatile("lea         -0x10(%0,%2,4),%0             \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
-      "lea         -0x10(%0),%0                  \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "lea         0x10(%1),%1                   \n"
-      "sub         $0x4,%2                       \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "pshufd      $0x1b,%%xmm0,%%xmm0           \n"
+               "lea         -0x10(%0),%0                  \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "lea         0x10(%1),%1                   \n"
+               "sub         $0x4,%2                       \n"
+               "jg          1b                            \n"
                : "+r"(src),        // %0
                  "+r"(dst),        // %1
                  "+r"(temp_width)  // %2
@@ -4838,16 +4908,16 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   ptrdiff_t temp_width = (ptrdiff_t)(width);
-      asm volatile("vmovdqu     %3,%%ymm5                     \n"
+  asm volatile("vmovdqu     %3,%%ymm5                     \n"
 
                LABELALIGN
-      "1:          \n"
-      "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x8,%2                       \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vpermd      -0x20(%0,%2,4),%%ymm5,%%ymm0  \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "lea         0x20(%1),%1                   \n"
+               "sub         $0x8,%2                       \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src),                    // %0
                  "+r"(dst),                    // %1
                  "+r"(temp_width)              // %2
@@ -4894,6 +4964,47 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
 }
 #endif  // HAS_SPLITUVROW_AVX2
 
+#ifdef HAS_SPLITUVROW_AVX512BW
+static const uint64_t kSplitUVPermute[8] = {0, 2, 4, 6, 1, 3, 5, 7};  // vpermq indices: even qwords first, then odd qwords
+
+void SplitUVRow_AVX512BW(const uint8_t* src_uv,  // interleaved byte pairs, read forward
+                         uint8_t* dst_u,  // receives the even source bytes
+                         uint8_t* dst_v,  // receives the odd source bytes
+                         int width) {  // counted in byte pairs; 64 pairs per loop pass
+  asm volatile(
+      "vpternlogd  $0xff,%%zmm5,%%zmm5,%%zmm5    \n"  // zmm5 = all ones
+      "vpsrlw      $0x8,%%zmm5,%%zmm5            \n"  // zmm5 = 0x00ff words
+      "vmovdqu64   %4,%%zmm4                     \n"  // zmm4 = qword permute
+      "sub         %1,%2                         \n"  // %2 = dst_v - dst_u
+
+      LABELALIGN
+      "1:          \n"
+      "vmovdqu8    (%0),%%zmm0                   \n"  // pairs 0..31
+      "vmovdqu8    0x40(%0),%%zmm1               \n"  // pairs 32..63
+      "lea         0x80(%0),%0                   \n"  // src_uv += 128
+      "vpsrlw      $0x8,%%zmm0,%%zmm2            \n"  // odd bytes as words
+      "vpsrlw      $0x8,%%zmm1,%%zmm3            \n"
+      "vpandd      %%zmm5,%%zmm0,%%zmm0          \n"  // even bytes as words
+      "vpandd      %%zmm5,%%zmm1,%%zmm1          \n"
+      "vpackuswb   %%zmm1,%%zmm0,%%zmm0          \n"  // pack U, per lane
+      "vpackuswb   %%zmm3,%%zmm2,%%zmm2          \n"  // pack V, per lane
+      "vpermq      %%zmm0,%%zmm4,%%zmm0          \n"  // undo lane interleave
+      "vpermq      %%zmm2,%%zmm4,%%zmm2          \n"
+      "vmovdqu8    %%zmm0,(%1)                   \n"  // store 64 U bytes
+      "vmovdqu8    %%zmm2,0x00(%1,%2,1)          \n"  // store 64 V bytes
+      "lea         0x40(%1),%1                   \n"  // dst_u += 64
+      "sub         $0x40,%3                      \n"  // 64 pairs done
+      "jg          1b                            \n"  // NOTE(review): presumably width % 64 == 0 - confirm
+      "vzeroupper  \n"
+      : "+r"(src_uv),         // %0
+        "+r"(dst_u),          // %1
+        "+r"(dst_v),          // %2
+        "+r"(width)           // %3
+      : "m"(kSplitUVPermute)  // %4
+      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5");
+}
+#endif  // HAS_SPLITUVROW_AVX512BW
+
 #ifdef HAS_SPLITUVROW_SSE2
 void SplitUVRow_SSE2(const uint8_t* src_uv,
                      uint8_t* dst_u,
@@ -5071,20 +5182,20 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_uv,
                          int width) {
-      asm volatile("sub         %0,%1                         \n"
+  asm volatile("sub         %0,%1                         \n"
 
                LABELALIGN
-      "1:          \n"
-      "vpmovzxbw   (%0),%%zmm0                   \n"
-      "vpmovzxbw   0x00(%0,%1,1),%%zmm1          \n"
-      "lea         0x20(%0),%0                   \n"
-      "vpsllw      $0x8,%%zmm1,%%zmm1            \n"
-      "vporq       %%zmm0,%%zmm1,%%zmm2          \n"
-      "vmovdqu64   %%zmm2,(%2)                   \n"
-      "lea         0x40(%2),%2                   \n"
-      "sub         $0x20,%3                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vpmovzxbw   (%0),%%zmm0                   \n"
+               "vpmovzxbw   0x00(%0,%1,1),%%zmm1          \n"
+               "lea         0x20(%0),%0                   \n"
+               "vpsllw      $0x8,%%zmm1,%%zmm1            \n"
+               "vporq       %%zmm0,%%zmm1,%%zmm2          \n"
+               "vmovdqu64   %%zmm2,(%2)                   \n"
+               "lea         0x40(%2),%2                   \n"
+               "sub         $0x20,%3                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5099,20 +5210,20 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-      asm volatile("sub         %0,%1                         \n"
+  asm volatile("sub         %0,%1                         \n"
 
                LABELALIGN
-      "1:          \n"
-      "vpmovzxbw   (%0),%%ymm0                   \n"
-      "vpmovzxbw   0x00(%0,%1,1),%%ymm1          \n"
-      "lea         0x10(%0),%0                   \n"
-      "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
-      "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
-      "vmovdqu     %%ymm2,(%2)                   \n"
-      "lea         0x20(%2),%2                   \n"
-      "sub         $0x10,%3                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vpmovzxbw   (%0),%%ymm0                   \n"
+               "vpmovzxbw   0x00(%0,%1,1),%%ymm1          \n"
+               "lea         0x10(%0),%0                   \n"
+               "vpsllw      $0x8,%%ymm1,%%ymm1            \n"
+               "vpor        %%ymm0,%%ymm1,%%ymm2          \n"
+               "vmovdqu     %%ymm2,(%2)                   \n"
+               "lea         0x20(%2),%2                   \n"
+               "sub         $0x10,%3                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5127,21 +5238,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-      asm volatile("sub         %0,%1                         \n"
+  asm volatile("sub         %0,%1                         \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqu      0x00(%0,%1,1),%%xmm1          \n"
-      "lea         0x10(%0),%0                   \n"
-      "movdqa      %%xmm0,%%xmm2                 \n"
-      "punpcklbw   %%xmm1,%%xmm0                 \n"
-      "punpckhbw   %%xmm1,%%xmm2                 \n"
-      "movdqu      %%xmm0,(%2)                   \n"
-      "movdqu      %%xmm2,0x10(%2)               \n"
-      "lea         0x20(%2),%2                   \n"
-      "sub         $0x10,%3                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqu      0x00(%0,%1,1),%%xmm1          \n"
+               "lea         0x10(%0),%0                   \n"
+               "movdqa      %%xmm0,%%xmm2                 \n"
+               "punpcklbw   %%xmm1,%%xmm0                 \n"
+               "punpckhbw   %%xmm1,%%xmm2                 \n"
+               "movdqu      %%xmm0,(%2)                   \n"
+               "movdqu      %%xmm2,0x10(%2)               \n"
+               "lea         0x20(%2),%2                   \n"
+               "sub         $0x10,%3                      \n"
+               "jg          1b                            \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5376,24 +5487,24 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y,
                               uint8_t* dst_y,
                               int scale,
                               int width) {
-      asm volatile("vpbroadcastw %3,%%zmm2                    \n"
+  asm volatile("vpbroadcastw %3,%%zmm2                    \n"
 
                // 64 pixels per loop.
                LABELALIGN
-      "1:          \n"
-      "vmovups     (%0),%%zmm0                   \n"
-      "vmovups     0x40(%0),%%zmm1               \n"
-      "add         $0x80,%0                      \n"
-      "vpmulhuw    %%zmm2,%%zmm0,%%zmm0          \n"
-      "vpmulhuw    %%zmm2,%%zmm1,%%zmm1          \n"
-      "vpmovuswb   %%zmm0,%%ymm0                 \n"
-      "vpmovuswb   %%zmm1,%%ymm1                 \n"
-      "vmovups     %%ymm0,(%1)                   \n"
-      "vmovups     %%ymm1,0x20(%1)               \n"
-      "add         $0x40,%1                      \n"
-      "sub         $0x40,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovups     (%0),%%zmm0                   \n"
+               "vmovups     0x40(%0),%%zmm1               \n"
+               "add         $0x80,%0                      \n"
+               "vpmulhuw    %%zmm2,%%zmm0,%%zmm0          \n"
+               "vpmulhuw    %%zmm2,%%zmm1,%%zmm1          \n"
+               "vpmovuswb   %%zmm0,%%ymm0                 \n"
+               "vpmovuswb   %%zmm1,%%ymm1                 \n"
+               "vmovups     %%ymm0,(%1)                   \n"
+               "vmovups     %%ymm1,0x20(%1)               \n"
+               "add         $0x40,%1                      \n"
+               "sub         $0x40,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_y),  // %0
                  "+r"(dst_y),  // %1
                  "+r"(width)   // %2
@@ -5443,24 +5554,24 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           int scale,
                           int width) {
   const int shift = __builtin_clz(scale) - 15;
-      asm volatile("vmovd       %3,%%xmm2                     \n"
+  asm volatile("vmovd       %3,%%xmm2                     \n"
 
                // 32 pixels per loop.
                LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-      "add         $0x20,%0                      \n"
-      "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
-      "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
-      "vpsrlw      %%xmm2,%%ymm0,%%ymm0          \n"
-      "vpsrlw      %%xmm2,%%ymm1,%%ymm1          \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "vmovdqu     %%ymm1,0x20(%1)               \n"
-      "add         $0x40,%1                      \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovdqu     (%0),%%ymm0                   \n"
+               "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+               "add         $0x20,%0                      \n"
+               "vpunpckhbw  %%ymm0,%%ymm0,%%ymm1          \n"
+               "vpunpcklbw  %%ymm0,%%ymm0,%%ymm0          \n"
+               "vpsrlw      %%xmm2,%%ymm0,%%ymm0          \n"
+               "vpsrlw      %%xmm2,%%ymm1,%%ymm1          \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "vmovdqu     %%ymm1,0x20(%1)               \n"
+               "add         $0x40,%1                      \n"
+               "sub         $0x20,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_y),  // %0
                  "+r"(dst_y),  // %1
                  "+r"(width)   // %2
@@ -6241,7 +6352,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r,
 #if defined(__i386__)
       : "m"(shift)  // %5
 #else
-      : "rm"(shift)   // %5
+      : "rm"(shift)  // %5
 #endif
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
@@ -6577,7 +6688,7 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {
 // Multiple of 1.
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-      asm volatile("rep         movsb                         \n"
+  asm volatile("rep         movsb                         \n"
                : "+S"(src),       // %0
                  "+D"(dst),       // %1
                  "+c"(width_tmp)  // %2
@@ -6787,7 +6898,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
   const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-      asm volatile("rep         stosl                         \n"
+  asm volatile("rep         stosl                         \n"
                : "+D"(dst),       // %0
                  "+c"(width_tmp)  // %1
                : "a"(v32)         // %2
@@ -6796,7 +6907,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
 
 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-      asm volatile("rep         stosb                         \n"
+  asm volatile("rep         stosb                         \n"
                : "+D"(dst),       // %0
                  "+c"(width_tmp)  // %1
                : "a"(v8)          // %2
@@ -6805,7 +6916,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
 
 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-      asm volatile("rep         stosl                         \n"
+  asm volatile("rep         stosl                         \n"
                : "+D"(dst_argb),  // %0
                  "+c"(width_tmp)  // %1
                : "a"(v32)         // %2
@@ -7966,28 +8077,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-      asm volatile("pxor        %%xmm5,%%xmm5                 \n"
+  asm volatile("pxor        %%xmm5,%%xmm5                 \n"
 
                // 4 pixel loop.
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "lea         0x10(%0),%0                   \n"
-      "movdqu      (%1),%%xmm2                   \n"
-      "lea         0x10(%1),%1                   \n"
-      "movdqu      %%xmm0,%%xmm1                 \n"
-      "movdqu      %%xmm2,%%xmm3                 \n"
-      "punpcklbw   %%xmm0,%%xmm0                 \n"
-      "punpckhbw   %%xmm1,%%xmm1                 \n"
-      "punpcklbw   %%xmm5,%%xmm2                 \n"
-      "punpckhbw   %%xmm5,%%xmm3                 \n"
-      "pmulhuw     %%xmm2,%%xmm0                 \n"
-      "pmulhuw     %%xmm3,%%xmm1                 \n"
-      "packuswb    %%xmm1,%%xmm0                 \n"
-      "movdqu      %%xmm0,(%2)                   \n"
-      "lea         0x10(%2),%2                   \n"
-      "sub         $0x4,%3                       \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "lea         0x10(%0),%0                   \n"
+               "movdqu      (%1),%%xmm2                   \n"
+               "lea         0x10(%1),%1                   \n"
+               "movdqu      %%xmm0,%%xmm1                 \n"
+               "movdqu      %%xmm2,%%xmm3                 \n"
+               "punpcklbw   %%xmm0,%%xmm0                 \n"
+               "punpckhbw   %%xmm1,%%xmm1                 \n"
+               "punpcklbw   %%xmm5,%%xmm2                 \n"
+               "punpckhbw   %%xmm5,%%xmm3                 \n"
+               "pmulhuw     %%xmm2,%%xmm0                 \n"
+               "pmulhuw     %%xmm3,%%xmm1                 \n"
+               "packuswb    %%xmm1,%%xmm0                 \n"
+               "movdqu      %%xmm0,(%2)                   \n"
+               "lea         0x10(%2),%2                   \n"
+               "sub         $0x4,%3                       \n"
+               "jg          1b                            \n"
                : "+r"(src_argb),   // %0
                  "+r"(src_argb1),  // %1
                  "+r"(dst_argb),   // %2
@@ -8003,27 +8114,27 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-      asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
 
-               // 4 pixel loop.
+               // 8 pixel loop.
                LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%ymm1                   \n"
-      "lea         0x20(%0),%0                   \n"
-      "vmovdqu     (%1),%%ymm3                   \n"
-      "lea         0x20(%1),%1                   \n"
-      "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
-      "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
-      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
-      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
-      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
-      "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
-      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
-      "vmovdqu     %%ymm0,(%2)                   \n"
-      "lea         0x20(%2),%2                   \n"
-      "sub         $0x8,%3                       \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovdqu     (%0),%%ymm1                   \n"
+               "lea         0x20(%0),%0                   \n"
+               "vmovdqu     (%1),%%ymm3                   \n"
+               "lea         0x20(%1),%1                   \n"
+               "vpunpcklbw  %%ymm1,%%ymm1,%%ymm0          \n"
+               "vpunpckhbw  %%ymm1,%%ymm1,%%ymm1          \n"
+               "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
+               "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
+               "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"
+               "vpmulhuw    %%ymm3,%%ymm1,%%ymm1          \n"
+               "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
+               "vmovdqu     %%ymm0,(%2)                   \n"
+               "lea         0x20(%2),%2                   \n"
+               "sub         $0x8,%3                       \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_argb),   // %0
                  "+r"(src_argb1),  // %1
                  "+r"(dst_argb),   // %2
@@ -8783,10 +8894,14 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
       "vmovd       %3,%%xmm5                     \n"
       "vpunpcklwd  %%xmm0,%%xmm5,%%xmm5          \n"
       "vpbroadcastd %%xmm5,%%ymm5                \n"
-      "mov         $0x80008000,%%eax             \n"  // 0x80008000 used to bias unsigned words to signed range for vpmaddwd.
+      "mov         $0x80008000,%%eax             \n"  // 0x80008000 used to bias
+                                                      // unsigned words to
+                                                      // signed range for
+                                                      // vpmaddwd.
       "vmovd       %%eax,%%xmm4                  \n"
       "vbroadcastss %%xmm4,%%ymm4                \n"
-      "mov         $8388736,%%eax                \n"  // 32768 * 256 + 128 rounding constant.
+      "mov         $8388736,%%eax                \n"  // 32768 * 256 + 128
+                                                      // rounding constant.
       "vmovd       %%eax,%%xmm3                  \n"
       "vbroadcastss %%xmm3,%%ymm3                \n"
 
@@ -8811,8 +8926,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
       "jg          1b                            \n"
       "jmp         99f                           \n"
 
-      "50:         \n"
-      LABELALIGN
+      "50:         \n" LABELALIGN
       "2:          \n"
       "vmovdqu     (%1),%%ymm0                   \n"
       "vpavgw      (%1,%4,2),%%ymm0,%%ymm0       \n"
@@ -8822,8 +8936,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
       "jg          2b                            \n"
       "jmp         99f                           \n"
 
-      "100:        \n"
-      LABELALIGN
+      "100:        \n" LABELALIGN
       "3:          \n"
       "vmovdqu     (%1),%%ymm0                   \n"
       "vmovdqu     %%ymm0,0x00(%1,%0,1)          \n"
@@ -8832,7 +8945,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
       "jg          3b                            \n"
 
       "99:         \n"
-      "vzeroupper                                \n"
+      "vzeroupper  \n"
       : "+r"(dst_ptr),           // %0
         "+r"(src_ptr),           // %1
         "+r"(width),             // %2
@@ -8848,20 +8961,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           const uint8_t* shuffler,
                           int width) {
-      asm volatile("movdqu      (%3),%%xmm5                   \n"
+  asm volatile("movdqu      (%3),%%xmm5                   \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqu      0x10(%0),%%xmm1               \n"
-      "lea         0x20(%0),%0                   \n"
-      "pshufb      %%xmm5,%%xmm0                 \n"
-      "pshufb      %%xmm5,%%xmm1                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "movdqu      %%xmm1,0x10(%1)               \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x8,%2                       \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqu      0x10(%0),%%xmm1               \n"
+               "lea         0x20(%0),%0                   \n"
+               "pshufb      %%xmm5,%%xmm0                 \n"
+               "pshufb      %%xmm5,%%xmm1                 \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "movdqu      %%xmm1,0x10(%1)               \n"
+               "lea         0x20(%1),%1                   \n"
+               "sub         $0x8,%2                       \n"
+               "jg          1b                            \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -8876,21 +8989,21 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
-      asm volatile("vbroadcasti128 (%3),%%ymm5                \n"
+  asm volatile("vbroadcasti128 (%3),%%ymm5                \n"
 
                LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "vmovdqu     0x20(%0),%%ymm1               \n"
-      "lea         0x40(%0),%0                   \n"
-      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "vmovdqu     %%ymm1,0x20(%1)               \n"
-      "lea         0x40(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovdqu     (%0),%%ymm0                   \n"
+               "vmovdqu     0x20(%0),%%ymm1               \n"
+               "lea         0x40(%0),%0                   \n"
+               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+               "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "vmovdqu     %%ymm1,0x20(%1)               \n"
+               "lea         0x40(%1),%1                   \n"
+               "sub         $0x10,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -8905,27 +9018,26 @@ void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const uint8_t* shuffler,
                              int width) {
-  asm volatile(
-      "vbroadcasti32x4 (%3),%%zmm5               \n"
+  asm volatile("vbroadcasti32x4 (%3),%%zmm5               \n"
 
-      LABELALIGN
-      "1:          \n"
-      "vmovdqu8    (%0),%%zmm0                   \n"
-      "vmovdqu8    0x40(%0),%%zmm1               \n"
-      "lea         0x80(%0),%0                   \n"
-      "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
-      "vpshufb     %%zmm5,%%zmm1,%%zmm1          \n"
-      "vmovdqu8    %%zmm0,(%1)                   \n"
-      "vmovdqu8    %%zmm1,0x40(%1)               \n"
-      "lea         0x80(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_argb),  // %1
-        "+r"(width)      // %2
-      : "r"(shuffler)    // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm5");
+               LABELALIGN
+               "1:          \n"
+               "vmovdqu8    (%0),%%zmm0                   \n"
+               "vmovdqu8    0x40(%0),%%zmm1               \n"
+               "lea         0x80(%0),%0                   \n"
+               "vpshufb     %%zmm5,%%zmm0,%%zmm0          \n"
+               "vpshufb     %%zmm5,%%zmm1,%%zmm1          \n"
+               "vmovdqu8    %%zmm0,(%1)                   \n"
+               "vmovdqu8    %%zmm1,0x40(%1)               \n"
+               "lea         0x80(%1),%1                   \n"
+               "sub         $0x20,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
+               : "+r"(src_argb),  // %0
+                 "+r"(dst_argb),  // %1
+                 "+r"(width)      // %2
+               : "r"(shuffler)    // %3
+               : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX512BW
 
@@ -8935,24 +9047,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-      asm volatile("sub         %1,%2                         \n"
+  asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-      "1:          \n"
-      "movq        (%1),%%xmm2                   \n"
-      "movq        0x00(%1,%2,1),%%xmm1          \n"
-      "add         $0x8,%1                       \n"
-      "punpcklbw   %%xmm1,%%xmm2                 \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "add         $0x10,%0                      \n"
-      "movdqa      %%xmm0,%%xmm1                 \n"
-      "punpcklbw   %%xmm2,%%xmm0                 \n"
-      "punpckhbw   %%xmm2,%%xmm1                 \n"
-      "movdqu      %%xmm0,(%3)                   \n"
-      "movdqu      %%xmm1,0x10(%3)               \n"
-      "lea         0x20(%3),%3                   \n"
-      "sub         $0x10,%4                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movq        (%1),%%xmm2                   \n"
+               "movq        0x00(%1,%2,1),%%xmm1          \n"
+               "add         $0x8,%1                       \n"
+               "punpcklbw   %%xmm1,%%xmm2                 \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "add         $0x10,%0                      \n"
+               "movdqa      %%xmm0,%%xmm1                 \n"
+               "punpcklbw   %%xmm2,%%xmm0                 \n"
+               "punpckhbw   %%xmm2,%%xmm1                 \n"
+               "movdqu      %%xmm0,(%3)                   \n"
+               "movdqu      %%xmm1,0x10(%3)               \n"
+               "lea         0x20(%3),%3                   \n"
+               "sub         $0x10,%4                      \n"
+               "jg          1b                            \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -8969,24 +9081,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-      asm volatile("sub         %1,%2                         \n"
+  asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-      "1:          \n"
-      "movq        (%1),%%xmm2                   \n"
-      "movq        0x00(%1,%2,1),%%xmm1          \n"
-      "add         $0x8,%1                       \n"
-      "punpcklbw   %%xmm1,%%xmm2                 \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqa      %%xmm2,%%xmm1                 \n"
-      "add         $0x10,%0                      \n"
-      "punpcklbw   %%xmm0,%%xmm1                 \n"
-      "punpckhbw   %%xmm0,%%xmm2                 \n"
-      "movdqu      %%xmm1,(%3)                   \n"
-      "movdqu      %%xmm2,0x10(%3)               \n"
-      "lea         0x20(%3),%3                   \n"
-      "sub         $0x10,%4                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movq        (%1),%%xmm2                   \n"
+               "movq        0x00(%1,%2,1),%%xmm1          \n"
+               "add         $0x8,%1                       \n"
+               "punpcklbw   %%xmm1,%%xmm2                 \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqa      %%xmm2,%%xmm1                 \n"
+               "add         $0x10,%0                      \n"
+               "punpcklbw   %%xmm0,%%xmm1                 \n"
+               "punpckhbw   %%xmm0,%%xmm2                 \n"
+               "movdqu      %%xmm1,(%3)                   \n"
+               "movdqu      %%xmm2,0x10(%3)               \n"
+               "lea         0x20(%3),%3                   \n"
+               "sub         $0x10,%4                      \n"
+               "jg          1b                            \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -9003,27 +9115,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-      asm volatile("sub         %1,%2                         \n"
+  asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-      "1:          \n"
-      "vpmovzxbw   (%1),%%ymm1                   \n"
-      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
-      "add         $0x10,%1                      \n"
-      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
-      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "add         $0x20,%0                      \n"
-      "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
-      "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
-      "vextractf128 $0x0,%%ymm1,(%3)             \n"
-      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
-      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
-      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
-      "lea         0x40(%3),%3                   \n"
-      "sub         $0x20,%4                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vpmovzxbw   (%1),%%ymm1                   \n"
+               "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
+               "add         $0x10,%1                      \n"
+               "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
+               "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
+               "vmovdqu     (%0),%%ymm0                   \n"
+               "add         $0x20,%0                      \n"
+               "vpunpcklbw  %%ymm2,%%ymm0,%%ymm1          \n"
+               "vpunpckhbw  %%ymm2,%%ymm0,%%ymm2          \n"
+               "vextractf128 $0x0,%%ymm1,(%3)             \n"
+               "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
+               "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
+               "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
+               "lea         0x40(%3),%3                   \n"
+               "sub         $0x20,%4                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -9040,27 +9152,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-      asm volatile("sub         %1,%2                         \n"
+  asm volatile("sub         %1,%2                         \n"
 
                LABELALIGN
-      "1:          \n"
-      "vpmovzxbw   (%1),%%ymm1                   \n"
-      "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
-      "add         $0x10,%1                      \n"
-      "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
-      "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "add         $0x20,%0                      \n"
-      "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
-      "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
-      "vextractf128 $0x0,%%ymm1,(%3)             \n"
-      "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
-      "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
-      "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
-      "lea         0x40(%3),%3                   \n"
-      "sub         $0x20,%4                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vpmovzxbw   (%1),%%ymm1                   \n"
+               "vpmovzxbw   0x00(%1,%2,1),%%ymm2          \n"
+               "add         $0x10,%1                      \n"
+               "vpsllw      $0x8,%%ymm2,%%ymm2            \n"
+               "vpor        %%ymm1,%%ymm2,%%ymm2          \n"
+               "vmovdqu     (%0),%%ymm0                   \n"
+               "add         $0x20,%0                      \n"
+               "vpunpcklbw  %%ymm0,%%ymm2,%%ymm1          \n"
+               "vpunpckhbw  %%ymm0,%%ymm2,%%ymm2          \n"
+               "vextractf128 $0x0,%%ymm1,(%3)             \n"
+               "vextractf128 $0x0,%%ymm2,0x10(%3)         \n"
+               "vextractf128 $0x1,%%ymm1,0x20(%3)         \n"
+               "vextractf128 $0x1,%%ymm2,0x30(%3)         \n"
+               "lea         0x40(%3),%3                   \n"
+               "sub         $0x20,%4                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_y),     // %0
                  "+r"(src_u),     // %1
                  "+r"(src_v),     // %2
@@ -9076,47 +9188,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const float* poly,
                             int width) {
-      asm volatile("pxor        %%xmm3,%%xmm3                 \n"
+  asm volatile("pxor        %%xmm3,%%xmm3                 \n"
 
                // 2 pixel loop.
                LABELALIGN
-      "1:          \n"
-      "movq        (%0),%%xmm0                   \n"
-      "lea         0x8(%0),%0                    \n"
-      "punpcklbw   %%xmm3,%%xmm0                 \n"
-      "movdqa      %%xmm0,%%xmm4                 \n"
-      "punpcklwd   %%xmm3,%%xmm0                 \n"
-      "punpckhwd   %%xmm3,%%xmm4                 \n"
-      "cvtdq2ps    %%xmm0,%%xmm0                 \n"
-      "cvtdq2ps    %%xmm4,%%xmm4                 \n"
-      "movdqa      %%xmm0,%%xmm1                 \n"
-      "movdqa      %%xmm4,%%xmm5                 \n"
-      "mulps       0x10(%3),%%xmm0               \n"
-      "mulps       0x10(%3),%%xmm4               \n"
-      "addps       (%3),%%xmm0                   \n"
-      "addps       (%3),%%xmm4                   \n"
-      "movdqa      %%xmm1,%%xmm2                 \n"
-      "movdqa      %%xmm5,%%xmm6                 \n"
-      "mulps       %%xmm1,%%xmm2                 \n"
-      "mulps       %%xmm5,%%xmm6                 \n"
-      "mulps       %%xmm2,%%xmm1                 \n"
-      "mulps       %%xmm6,%%xmm5                 \n"
-      "mulps       0x20(%3),%%xmm2               \n"
-      "mulps       0x20(%3),%%xmm6               \n"
-      "mulps       0x30(%3),%%xmm1               \n"
-      "mulps       0x30(%3),%%xmm5               \n"
-      "addps       %%xmm2,%%xmm0                 \n"
-      "addps       %%xmm6,%%xmm4                 \n"
-      "addps       %%xmm1,%%xmm0                 \n"
-      "addps       %%xmm5,%%xmm4                 \n"
-      "cvttps2dq   %%xmm0,%%xmm0                 \n"
-      "cvttps2dq   %%xmm4,%%xmm4                 \n"
-      "packuswb    %%xmm4,%%xmm0                 \n"
-      "packuswb    %%xmm0,%%xmm0                 \n"
-      "movq        %%xmm0,(%1)                   \n"
-      "lea         0x8(%1),%1                    \n"
-      "sub         $0x2,%2                       \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movq        (%0),%%xmm0                   \n"
+               "lea         0x8(%0),%0                    \n"
+               "punpcklbw   %%xmm3,%%xmm0                 \n"
+               "movdqa      %%xmm0,%%xmm4                 \n"
+               "punpcklwd   %%xmm3,%%xmm0                 \n"
+               "punpckhwd   %%xmm3,%%xmm4                 \n"
+               "cvtdq2ps    %%xmm0,%%xmm0                 \n"
+               "cvtdq2ps    %%xmm4,%%xmm4                 \n"
+               "movdqa      %%xmm0,%%xmm1                 \n"
+               "movdqa      %%xmm4,%%xmm5                 \n"
+               "mulps       0x10(%3),%%xmm0               \n"
+               "mulps       0x10(%3),%%xmm4               \n"
+               "addps       (%3),%%xmm0                   \n"
+               "addps       (%3),%%xmm4                   \n"
+               "movdqa      %%xmm1,%%xmm2                 \n"
+               "movdqa      %%xmm5,%%xmm6                 \n"
+               "mulps       %%xmm1,%%xmm2                 \n"
+               "mulps       %%xmm5,%%xmm6                 \n"
+               "mulps       %%xmm2,%%xmm1                 \n"
+               "mulps       %%xmm6,%%xmm5                 \n"
+               "mulps       0x20(%3),%%xmm2               \n"
+               "mulps       0x20(%3),%%xmm6               \n"
+               "mulps       0x30(%3),%%xmm1               \n"
+               "mulps       0x30(%3),%%xmm5               \n"
+               "addps       %%xmm2,%%xmm0                 \n"
+               "addps       %%xmm6,%%xmm4                 \n"
+               "addps       %%xmm1,%%xmm0                 \n"
+               "addps       %%xmm5,%%xmm4                 \n"
+               "cvttps2dq   %%xmm0,%%xmm0                 \n"
+               "cvttps2dq   %%xmm4,%%xmm4                 \n"
+               "packuswb    %%xmm4,%%xmm0                 \n"
+               "packuswb    %%xmm0,%%xmm0                 \n"
+               "movq        %%xmm0,(%1)                   \n"
+               "lea         0x8(%1),%1                    \n"
+               "sub         $0x2,%2                       \n"
+               "jg          1b                            \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -9204,7 +9316,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)    // %3
+      : "m"(scale)  // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
 }
@@ -9242,7 +9354,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)    // %3
+      : "m"(scale)  // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4");
 }
@@ -9576,20 +9688,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
 
 // Convert UV plane of NV12 to VU of NV21.
 void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-      asm volatile("movdqu      %3,%%xmm5                     \n"
+  asm volatile("movdqu      %3,%%xmm5                     \n"
 
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqu      0x10(%0),%%xmm1               \n"
-      "lea         0x20(%0),%0                   \n"
-      "pshufb      %%xmm5,%%xmm0                 \n"
-      "pshufb      %%xmm5,%%xmm1                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "movdqu      %%xmm1,0x10(%1)               \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqu      0x10(%0),%%xmm1               \n"
+               "lea         0x20(%0),%0                   \n"
+               "pshufb      %%xmm5,%%xmm0                 \n"
+               "pshufb      %%xmm5,%%xmm1                 \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "movdqu      %%xmm1,0x10(%1)               \n"
+               "lea         0x20(%1),%1                   \n"
+               "sub         $0x10,%2                      \n"
+               "jg          1b                            \n"
                : "+r"(src_uv),        // %0
                  "+r"(dst_vu),        // %1
                  "+r"(width)          // %2
@@ -9600,21 +9712,21 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
 
 #ifdef HAS_SWAPUVROW_AVX2
 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-      asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
+  asm volatile("vbroadcasti128 %3,%%ymm5                  \n"
 
                LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%ymm0                   \n"
-      "vmovdqu     0x20(%0),%%ymm1               \n"
-      "lea         0x40(%0),%0                   \n"
-      "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
-      "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "vmovdqu     %%ymm1,0x20(%1)               \n"
-      "lea         0x40(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovdqu     (%0),%%ymm0                   \n"
+               "vmovdqu     0x20(%0),%%ymm1               \n"
+               "lea         0x40(%0),%0                   \n"
+               "vpshufb     %%ymm5,%%ymm0,%%ymm0          \n"
+               "vpshufb     %%ymm5,%%ymm1,%%ymm1          \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "vmovdqu     %%ymm1,0x20(%1)               \n"
+               "lea         0x40(%1),%1                   \n"
+               "sub         $0x20,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_uv),        // %0
                  "+r"(dst_vu),        // %1
                  "+r"(width)          // %2
diff --git a/source/row_lasx.cc b/source/row_lasx.cc
index 94cb44ed1..e0802c15e 100644
--- a/source/row_lasx.cc
+++ b/source/row_lasx.cc
@@ -2027,10 +2027,12 @@ struct ArgbConstants {
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                        128,
-                                                        0};
+                                                         128,
+                                                         0};
 
-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
+                                                       128,
+                                                       0};
 
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@@ -2039,19 +2041,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}
 // Add 16.5 = 0x1080
 
 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                        0x1080,
-                                                        0};
+                                                         0x1080,
+                                                         0};
 
 static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                      0x1080,
-                                                      0};
+                                                       0x1080,
+                                                       0};
 #endif  // ArgbConstants
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
-                                  uint8_t* dst_y,
-                                  int width,
-                                  const struct ArgbConstants* c) {
+                           uint8_t* dst_y,
+                           int width,
+                           const struct ArgbConstants* c) {
   int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
   asm volatile(
       "xvldrepl.b      $xr0,  %3,    0             \n\t"  // load rgbconstants
@@ -2216,18 +2218,14 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
       "xvst            $xr10, %1,    0             \n\t"
       "addi.d          %1,    %1,    32            \n\t"
       "bnez            %2,    1b                   \n\t"
-      : "+&r"(src_rgba),    // %0
-        "+&r"(dst_y),       // %1
-        "+&r"(width)        // %2
-      : "r"(c),  // %3
-        "r"(shuff)          // %4
+      : "+&r"(src_rgba),  // %0
+        "+&r"(dst_y),     // %1
+        "+&r"(width)      // %2
+      : "r"(c),           // %3
+        "r"(shuff)        // %4
       : "memory");
 }
 
-
-
-
-
 void ARGBToUVJRow_LASX(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
diff --git a/source/row_lsx.cc b/source/row_lsx.cc
index 41689578a..3e6d5154c 100644
--- a/source/row_lsx.cc
+++ b/source/row_lsx.cc
@@ -2812,10 +2812,12 @@ struct ArgbConstants {
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                        128,
-                                                        0};
+                                                         128,
+                                                         0};
 
-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
+                                                       128,
+                                                       0};
 
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@@ -2824,19 +2826,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}
 // Add 16.5 = 0x1080
 
 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                        0x1080,
-                                                        0};
+                                                         0x1080,
+                                                         0};
 
 static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
-                                                      0x1080,
-                                                      0};
+                                                       0x1080,
+                                                       0};
 #endif  // ArgbConstants
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
-                                 uint8_t* dst_y,
-                                 int width,
-                                 const struct ArgbConstants* c) {
+                          uint8_t* dst_y,
+                          int width,
+                          const struct ArgbConstants* c) {
   asm volatile(
       "vldrepl.b      $vr0,  %3,    0             \n\t"  // load rgbconstants
       "vldrepl.b      $vr1,  %3,    1             \n\t"  // load rgbconstants
@@ -2987,18 +2989,14 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
       "vst            $vr10, %1,    0             \n\t"
       "addi.d         %1,    %1,    16            \n\t"
       "bnez           %2,    1b                   \n\t"
-      : "+&r"(src_rgba),    // %0
-        "+&r"(dst_y),       // %1
-        "+&r"(width)        // %2
-      : "r"(c),  // %3
-        "r"(shuff)          // %4
+      : "+&r"(src_rgba),  // %0
+        "+&r"(dst_y),     // %1
+        "+&r"(width)      // %2
+      : "r"(c),           // %3
+        "r"(shuff)        // %4
       : "memory");
 }
 
-
-
-
-
 // undef for unified sources build
 #undef YUVTORGB_SETUP
 #undef READYUV422_D
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 62644a321..08608005f 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -272,7 +272,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
       "subs        %[width], %[width], #8        \n"  //
       YUVTORGB                                        //
           RGBTORGB8                                   //
-              STORERGBA                               //
+      STORERGBA                                       //
       "bgt         1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
         [src_u] "+r"(src_u),                               // %[src_u]
@@ -325,9 +325,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
       YUVTORGB_SETUP
       "vmov.u8     d6, #255                      \n"
       "1:          \n"  //
-      READYUV422
-      "subs        %[width], %[width], #8        \n" YUVTORGB RGBTORGB8
-          ARGBTORGB565
+      READYUV422 "subs        %[width], %[width], #8        \n" YUVTORGB
+          RGBTORGB8 ARGBTORGB565
       "vst1.8      {q2}, [%[dst_rgb565]]!        \n"  // store 8 pixels RGB565.
       "bgt         1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
@@ -1887,13 +1886,13 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
       "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
       "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
       "bgt         1b                            \n"
-      : "+r"(src_argb),     // %0
-        "+r"(dst_u),        // %1
-        "+r"(dst_v),        // %2
-        "+r"(width)         // %3
-      : "r"(&c->kRGBToU),   // %4
-        "r"(&c->kRGBToV),   // %5
-        "r"(&c->kAddUV)     // %6
+      : "+r"(src_argb),    // %0
+        "+r"(dst_u),       // %1
+        "+r"(dst_v),       // %2
+        "+r"(width)        // %3
+      : "r"(&c->kRGBToU),  // %4
+        "r"(&c->kRGBToV),  // %5
+        "r"(&c->kAddUV)    // %6
       : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
         "q10", "q11", "q12");
 }
@@ -1912,7 +1911,6 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
   ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
 }
 
-
 // clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR)                                                 \
@@ -1934,8 +1932,9 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                             int width,
                             const struct ArgbConstants* c) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile (
-      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes, only 4 used)
+  asm volatile(
+      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes,
+                                                      // only 4 used)
       "vld1.8      {d25}, [%6]                   \n"  // load kRGBToV
       "vmovl.s8    q14, d24                      \n"  // U coeffs in d28
       "vmovl.s8    q15, d25                      \n"  // V coeffs in d30
@@ -1943,7 +1942,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
 
       "1:          \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels.
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB
+                                                      // pixels.
       "subs        %4, %4, #16                   \n"  // 16 processed per loop.
       "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
       "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
@@ -1985,16 +1985,15 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
       "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
       "bgt         1b                            \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  : "r"(&c->kRGBToU),  // %5
-    "r"(&c->kRGBToV)   // %6
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q11", "q12", "q14", "q15"
-  );
+      : "+r"(src_argb),    // %0
+        "+r"(src_argb_1),  // %1
+        "+r"(dst_u),       // %2
+        "+r"(dst_v),       // %3
+        "+r"(width)        // %4
+      : "r"(&c->kRGBToU),  // %5
+        "r"(&c->kRGBToV)   // %6
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q11", "q12", "q14", "q15");
 }
 
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
@@ -2704,9 +2703,9 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                            uint8_t* dst_y,
-                            int width,
-                            const struct ArgbConstants* c) {
+                           uint8_t* dst_y,
+                           int width,
+                           const struct ArgbConstants* c) {
   asm volatile(
       "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
       "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
@@ -2773,9 +2772,9 @@ void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
 }
 
 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
-                                 uint8_t* dst_y,
-                                 int width,
-                                 const struct ArgbConstants* c) {
+                          uint8_t* dst_y,
+                          int width,
+                          const struct ArgbConstants* c) {
   asm volatile(
       "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
       "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
@@ -2807,10 +2806,6 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
         "d24", "d25");
 }
 
-
-
-
-
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 65d7b65a5..f90b4a18b 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/convert_from_argb.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -292,12 +292,12 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
   uint16_t limit = 0x3ff0;
   uint16_t alpha = 0xc000;
   asm volatile(YUVTORGB_SETUP
-      "dup         v22.8h, %w[limit]             \n"
-      "dup         v23.8h, %w[alpha]             \n"
-      "1:          \n"  //
+               "dup         v22.8h, %w[limit]             \n"
+               "dup         v23.8h, %w[alpha]             \n"
+               "1:          \n"  //
                READYUV210
-      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-      "b.gt        1b                            \n"
+               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+               "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),             // %[src_y]
                  [src_u] "+r"(src_u),             // %[src_u]
                  [src_v] "+r"(src_v),             // %[src_v]
@@ -321,12 +321,12 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
   uint16_t limit = 0x3ff0;
   uint16_t alpha = 0xc000;
   asm volatile(YUVTORGB_SETUP
-      "dup         v22.8h, %w[limit]             \n"
-      "dup         v23.8h, %w[alpha]             \n"
-      "1:          \n"  //
+               "dup         v22.8h, %w[limit]             \n"
+               "dup         v23.8h, %w[alpha]             \n"
+               "1:          \n"  //
                READYUV410
-      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-      "b.gt        1b                            \n"
+               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+               "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),             // %[src_y]
                  [src_u] "+r"(src_u),             // %[src_u]
                  [src_v] "+r"(src_v),             // %[src_v]
@@ -349,12 +349,12 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   const uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-      "dup         v22.8h, %w[limit]             \n"
-      "movi        v23.8h, #0xc0, lsl #8         \n"  // A
-      "1:          \n"                                //
+               "dup         v22.8h, %w[limit]             \n"
+               "movi        v23.8h, #0xc0, lsl #8         \n"  // A
+               "1:          \n"                                //
                READYUV212
-      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-      "b.gt        1b                            \n"
+               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+               "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),             // %[src_y]
                  [src_u] "+r"(src_u),             // %[src_u]
                  [src_v] "+r"(src_v),             // %[src_v]
@@ -531,13 +531,13 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   const uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-      "dup         v22.8h, %w[limit]             \n"
-      "movi        v23.8h, #0xc0, lsl #8         \n"  // A
-      "ldr         q2, [%[kIndices]]             \n"
-      "1:          \n"  //
+               "dup         v22.8h, %w[limit]             \n"
+               "movi        v23.8h, #0xc0, lsl #8         \n"  // A
+               "ldr         q2, [%[kIndices]]             \n"
+               "1:          \n"  //
                READYUVP210
-      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-      "b.gt        1b                            \n"
+               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+               "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),                     // %[src_y]
                  [src_uv] "+r"(src_uv),                   // %[src_uv]
                  [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
@@ -558,13 +558,13 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-      "dup         v22.8h, %w[limit]             \n"
-      "movi        v23.8h, #0xc0, lsl #8         \n"  // A
-      "ldr         q2, [%[kIndices]]             \n"
-      "1:          \n"  //
+               "dup         v22.8h, %w[limit]             \n"
+               "movi        v23.8h, #0xc0, lsl #8         \n"  // A
+               "ldr         q2, [%[kIndices]]             \n"
+               "1:          \n"  //
                READYUVP410
-      "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
-      "b.gt        1b                            \n"
+               "subs        %w[width], %w[width], #8      \n" NVTORGB STOREAR30
+               "b.gt        1b                            \n"
                : [src_y] "+r"(src_y),                     // %[src_y]
                  [src_uv] "+r"(src_uv),                   // %[src_uv]
                  [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
@@ -783,9 +783,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
   asm volatile(
       YUVTORGB_SETUP
       "1:          \n"  //
-      READYUV422
-      "subs        %w[width], %w[width], #8      \n" I4XXTORGB RGBTORGB8_TOP
-          ARGBTORGB565_FROM_TOP
+      READYUV422 "subs        %w[width], %w[width], #8      \n" I4XXTORGB
+          RGBTORGB8_TOP ARGBTORGB565_FROM_TOP
       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
       "b.gt        1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
@@ -1036,9 +1035,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
       YUVTORGB_SETUP
       "ldr         q2, [%[kNV12Table]]           \n"
       "1:          \n"  //
-      READNV12
-      "subs        %w[width], %w[width], #8      \n" NVTORGB RGBTORGB8_TOP
-          ARGBTORGB565_FROM_TOP
+      READNV12 "subs        %w[width], %w[width], #8      \n" NVTORGB
+          RGBTORGB8_TOP ARGBTORGB565_FROM_TOP
       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8
                                                        // pixels
                                                        // RGB565.
@@ -2742,20 +2740,22 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                                int width,
                                const struct ArgbConstants* c) {
   asm volatile(
-      "ldr        q16, [%[c], #16]               \n" // kRGBToU
-      "ldr        q17, [%[c], #32]               \n" // kRGBToV
-      "ldr        s0, [%[c], #64]                \n" // kAddUV
-      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
-      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
-      "dup        v20.8h, v16.h[0]               \n" // U0
-      "dup        v21.8h, v16.h[1]               \n" // U1
-      "dup        v22.8h, v16.h[2]               \n" // U2
-      "dup        v23.8h, v16.h[3]               \n" // U3
-      "dup        v24.8h, v17.h[0]               \n" // V0
-      "dup        v26.8h, v17.h[1]               \n" // V1
-      "dup        v27.8h, v17.h[2]               \n" // V2
-      "dup        v28.8h, v17.h[3]               \n" // V3
-      "dup        v25.8h, v0.h[0]                \n" // kAddUV
+      "ldr         q16, [%[c], #16]               \n"  // kRGBToU
+      "ldr         q17, [%[c], #32]               \n"  // kRGBToV
+      "ldr         s0, [%[c], #64]                \n"  // kAddUV
+      "sxtl        v16.8h, v16.8b                 \n"  // sign extend U coeffs
+                                                       // to 16-bit
+      "sxtl        v17.8h, v17.8b                 \n"  // sign extend V coeffs
+                                                       // to 16-bit
+      "dup         v20.8h, v16.h[0]               \n"  // U0
+      "dup         v21.8h, v16.h[1]               \n"  // U1
+      "dup         v22.8h, v16.h[2]               \n"  // U2
+      "dup         v23.8h, v16.h[3]               \n"  // U3
+      "dup         v24.8h, v17.h[0]               \n"  // V0
+      "dup         v26.8h, v17.h[1]               \n"  // V1
+      "dup         v27.8h, v17.h[2]               \n"  // V2
+      "dup         v28.8h, v17.h[3]               \n"  // V3
+      "dup         v25.8h, v0.h[0]                \n"  // kAddUV
       "1:          \n"
       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
@@ -2783,27 +2783,26 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
       "st1         {v0.8b}, [%1], #8             \n"
       "st1         {v1.8b}, [%2], #8             \n"
       "b.gt        1b                            \n"
-      : "+r"(src_argb),     // %0
-        "+r"(dst_u),        // %1
-        "+r"(dst_v),        // %2
-        "+r"(width)         // %3
-      : [c] "r"(c)          // %4
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-        "v26", "v27", "v28");
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      : [c] "r"(c)       // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+        "v27", "v28");
 }
 
-static void ARGBToUV444MatrixRow_NEON_I8MM(
-    const uint8_t* src_argb,
-    uint8_t* dst_u,
-    uint8_t* dst_v,
-    int width,
-    const struct ArgbConstants* c) {
+static void ARGBToUV444MatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width,
+                                           const struct ArgbConstants* c) {
   asm volatile(
-      "ldr         q16, [%[c], #16]              \n" // kRGBToU
-      "ldr         q17, [%[c], #32]              \n" // kRGBToV
-      "ldr         s0, [%[c], #64]               \n" // kAddUV
-      "dup         v29.8h, v0.h[0]               \n" // 128.0
+      "ldr         q16, [%[c], #16]              \n"  // kRGBToU
+      "ldr         q17, [%[c], #32]              \n"  // kRGBToV
+      "ldr         s0, [%[c], #64]               \n"  // kAddUV
+      "dup         v29.8h, v0.h[0]               \n"  // 128.0
       "1:          \n"
       "ldp         q0, q1, [%[src]], #32         \n"
       "subs        %w[width], %w[width], #8      \n"  // 8 processed per loop.
@@ -2823,11 +2822,11 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
       "str         d0, [%[dst_u]], #8            \n"  // store 8 pixels U.
       "str         d1, [%[dst_v]], #8            \n"  // store 8 pixels V.
       "b.gt        1b                            \n"
-      : [src] "+r"(src_argb),     // %[src]
-        [dst_u] "+r"(dst_u),      // %[dst_u]
-        [dst_v] "+r"(dst_v),      // %[dst_v]
-        [width] "+r"(width)       // %[width]
-      : [c] "r"(c)  // %[c]
+      : [src] "+r"(src_argb),  // %[src]
+        [dst_u] "+r"(dst_u),   // %[dst_u]
+        [dst_v] "+r"(dst_v),   // %[dst_v]
+        [width] "+r"(width)    // %[width]
+      : [c] "r"(c)             // %[c]
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
         "v29");
 }
@@ -2844,8 +2843,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kArgbI601Constants);
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbI601Constants);
 }
 
 void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@@ -2860,8 +2858,7 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kArgbJPEGConstants);
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
 }
 
 void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
@@ -2903,23 +2900,27 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                             int width,
                             const struct ArgbConstants* c) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile (
-      "ldr        q16, [%[c], #16]               \n" // kRGBToU
-      "ldr        q17, [%[c], #32]               \n" // kRGBToV
-      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
-      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
-      "dup        v20.8h, v16.h[0]               \n" // U0
-      "dup        v21.8h, v16.h[1]               \n" // U1
-      "dup        v22.8h, v16.h[2]               \n" // U2
-      "dup        v23.8h, v16.h[3]               \n" // U3
-      "dup        v24.8h, v17.h[0]               \n" // V0
-      "dup        v26.8h, v17.h[1]               \n" // V1
-      "dup        v27.8h, v17.h[2]               \n" // V2
-      "dup        v28.8h, v17.h[3]               \n" // V3
-      "movi       v25.8h, #0x80, lsl #8          \n" // 128.0 in 16-bit (0x8000)
+  asm volatile(
+      "ldr         q16, [%[c], #16]               \n"  // kRGBToU
+      "ldr         q17, [%[c], #32]               \n"  // kRGBToV
+      "sxtl        v16.8h, v16.8b                 \n"  // sign extend U coeffs
+                                                       // to 16-bit
+      "sxtl        v17.8h, v17.8b                 \n"  // sign extend V coeffs
+                                                       // to 16-bit
+      "dup         v20.8h, v16.h[0]               \n"  // U0
+      "dup         v21.8h, v16.h[1]               \n"  // U1
+      "dup         v22.8h, v16.h[2]               \n"  // U2
+      "dup         v23.8h, v16.h[3]               \n"  // U3
+      "dup         v24.8h, v17.h[0]               \n"  // V0
+      "dup         v26.8h, v17.h[1]               \n"  // V1
+      "dup         v27.8h, v17.h[2]               \n"  // V2
+      "dup         v28.8h, v17.h[3]               \n"  // V3
+      "movi        v25.8h, #0x80, lsl #8          \n"  // 128.0 in 16-bit
+                                                       // (0x8000)
 
       "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
+                                                                 // pixels.
       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
       "prfm        pldl1keep, [%0, 448]          \n"
@@ -2927,7 +2928,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
       "uaddlp      v18.8h, v3.16b                \n"  // A 16 bytes -> 8 shorts.
 
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16
+                                                                 // more.
       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
       "prfm        pldl1keep, [%1, 448]          \n"
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
@@ -2940,34 +2942,33 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
       "urshr       v18.8h, v18.8h, #2             \n"
 
       // U = B*U0 + G*U1 + R*U2 + A*U3
-      "mul        v3.8h, v0.8h, v20.8h          \n"
-      "mla        v3.8h, v1.8h, v21.8h          \n"
-      "mla        v3.8h, v2.8h, v22.8h          \n"
-      "mla        v3.8h, v18.8h, v23.8h         \n"
+      "mul         v3.8h, v0.8h, v20.8h          \n"
+      "mla         v3.8h, v1.8h, v21.8h          \n"
+      "mla         v3.8h, v2.8h, v22.8h          \n"
+      "mla         v3.8h, v18.8h, v23.8h         \n"
 
       // V = B*V0 + G*V1 + R*V2 + A*V3
-      "mul        v4.8h, v0.8h, v24.8h          \n"
-      "mla        v4.8h, v1.8h, v26.8h          \n"
-      "mla        v4.8h, v2.8h, v27.8h          \n"
-      "mla        v4.8h, v18.8h, v28.8h         \n"
+      "mul         v4.8h, v0.8h, v24.8h          \n"
+      "mla         v4.8h, v1.8h, v26.8h          \n"
+      "mla         v4.8h, v2.8h, v27.8h          \n"
+      "mla         v4.8h, v18.8h, v28.8h         \n"
 
       // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
-      "subhn      v0.8b, v25.8h, v3.8h           \n"
-      "subhn      v1.8b, v25.8h, v4.8h           \n"
+      "subhn       v0.8b, v25.8h, v3.8h           \n"
+      "subhn       v1.8b, v25.8h, v4.8h           \n"
 
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
       "b.gt        1b                            \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  : [c] "r"(c)         // %5
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-    "v27", "v28"
-  );
+      : "+r"(src_argb),    // %0
+        "+r"(src_argb_1),  // %1
+        "+r"(dst_u),       // %2
+        "+r"(dst_v),       // %3
+        "+r"(width)        // %4
+      : [c] "r"(c)         // %5
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28");
 }
 
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
@@ -3330,11 +3331,11 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 
 // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
 static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
-                                        int src_stride,
-                                        uint8_t* dst_u,
-                                        uint8_t* dst_v,
-                                        int width,
-                                        const struct ArgbConstants* c) {
+                                             int src_stride,
+                                             uint8_t* dst_u,
+                                             uint8_t* dst_v,
+                                             int width,
+                                             const struct ArgbConstants* c) {
   const uint8_t* src1 = src + src_stride;
   asm volatile(
       "movi        v23.8h, #0x80, lsl #8           \n"  // 128.0 (0x8000 in
@@ -3388,12 +3389,12 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
       "str         d0, [%[dst_u]], #8              \n"  // store 8 pixels U
       "str         d1, [%[dst_v]], #8              \n"  // store 8 pixels V
       "b.gt        1b                              \n"
-      : [src] "+r"(src),                // %[src]
-        [src1] "+r"(src1),              // %[src1]
-        [dst_u] "+r"(dst_u),            // %[dst_u]
-        [dst_v] "+r"(dst_v),            // %[dst_v]
-        [width] "+r"(width)             // %[width]
-      : [c] "r"(c)                      // %[c]
+      : [src] "+r"(src),      // %[src]
+        [src1] "+r"(src1),    // %[src1]
+        [dst_u] "+r"(dst_u),  // %[dst_u]
+        [dst_v] "+r"(dst_v),  // %[dst_v]
+        [width] "+r"(width)   // %[width]
+      : [c] "r"(c)            // %[c]
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
         "v24", "v25");
 }
@@ -3404,8 +3405,8 @@ void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                  uint8_t* dst_v,
                                  int width,
                                  const struct ArgbConstants* c) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                                   c);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
+                                   width, c);
 }
 
 void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
@@ -3413,8 +3414,8 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                              &kArgbI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
+                                   width, &kArgbI601Constants);
 }
 
 void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
@@ -3422,8 +3423,8 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                              &kAbgrI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v,
+                                   width, &kAbgrI601Constants);
 }
 
 void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
@@ -3431,8 +3432,8 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-                              &kBgraI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v,
+                                   width, &kBgraI601Constants);
 }
 
 void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
@@ -3440,8 +3441,8 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
-                              &kRgbaI601Constants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v,
+                                   width, &kRgbaI601Constants);
 }
 
 void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
@@ -3449,8 +3450,8 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                              &kArgbJPEGConstants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
+                                   width, &kArgbJPEGConstants);
 }
 
 void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
@@ -3458,8 +3459,8 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                              &kAbgrJPEGConstants);
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v,
+                                   width, &kAbgrJPEGConstants);
 }
 
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -3558,13 +3559,11 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
       : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
 }
 
-
-
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                                  uint8_t* dst_y,
-                                  int width,
-                                  const struct ArgbConstants* c) {
+                           uint8_t* dst_y,
+                           int width,
+                           const struct ArgbConstants* c) {
   asm volatile(
       "ldr         s16, [%3]                     \n"  // load 4 coeffs
       "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
@@ -3589,20 +3588,18 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
       "addhn       v1.8b, v1.8h, v22.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),    // %0
-        "+r"(dst_y),       // %1
-        "+r"(width)        // %2
-      : "r"(c)             // %3
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "r"(c)           // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
         "v19", "v20", "v21", "v22");
 }
 
-
-void ARGBToYMatrixRow_NEON_DotProd(
-    const uint8_t* src_argb,
-    uint8_t* dst_y,
-    int width,
-    const struct ArgbConstants* c) {
+void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
+                                   uint8_t* dst_y,
+                                   int width,
+                                   const struct ArgbConstants* c) {
   asm volatile(
       "ldr         s16, [%3]                     \n"  // load 4 coeffs
       "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
@@ -3625,14 +3622,14 @@ void ARGBToYMatrixRow_NEON_DotProd(
       "addhn       v1.8b, v1.8h, v19.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
-      : "+r"(src_argb),    // %0
-        "+r"(dst_y),       // %1
-        "+r"(width)        // %2
-      : "r"(c)             // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "r"(c)           // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19");
 }
 
-
 // RGB to JPeg coefficients
 
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -3708,9 +3705,9 @@ void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
 }
 
 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
-                                 uint8_t* dst_y,
-                                 int width,
-                                 const struct ArgbConstants* c) {
+                          uint8_t* dst_y,
+                          int width,
+                          const struct ArgbConstants* c) {
   asm volatile(
       "ldr         s16, [%3]                     \n"  // load 4 coeffs
       "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
@@ -3732,18 +3729,14 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
       "addhn       v1.8b, v1.8h, v21.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
-      : "+r"(src_rgb),     // %0
-        "+r"(dst_y),       // %1
-        "+r"(width)        // %2
-      : "r"(c)  // %3
+      : "+r"(src_rgb),  // %0
+        "+r"(dst_y),    // %1
+        "+r"(width)     // %2
+      : "r"(c)          // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
         "v19", "v20", "v21");
 }
 
-
-
-
-
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 93bc431bc..91752ed16 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -1249,16 +1249,22 @@ void MergeUVRow_RVV(const uint8_t* src_u,
 }
 #endif
 
-
-
 // RGB to JPeg coefficients
 // B * 0.1140 coefficient = 29
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x80
-static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {0}, {0}, {128}, {0}};
+static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+                                                         {0},
+                                                         {0},
+                                                         {128},
+                                                         {0}};
 
-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0}, {128}, {0}};
+static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0},
+                                                       {0},
+                                                       {0},
+                                                       {128},
+                                                       {0}};
 
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
@@ -1266,16 +1272,24 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0
 // R * 0.2578 coefficient = 66
 // Add 16.5 = 0x1080
 
-static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {0}, {0}, {0x1080}, {0}};
+static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+                                                         {0},
+                                                         {0},
+                                                         {0x1080},
+                                                         {0}};
 
-static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {0}, {0}, {0x1080}, {0}};
+static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
+                                                       {0},
+                                                       {0},
+                                                       {0x1080},
+                                                       {0}};
 
 // ARGB expects first 3 values to contain RGB and 4th value is ignored
 #ifdef HAS_ARGBTOYMATRIXROW_RVV
 void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
-                                 uint8_t* dst_y,
-                                 int width,
-                                 const struct ArgbConstants* c) {
+                          uint8_t* dst_y,
+                          int width,
+                          const struct ArgbConstants* c) {
   assert(width != 0);
   size_t w = (size_t)width;
   vuint8m2_t v_by, v_gy, v_ry;  // vectors are to store RGBToY constant
diff --git a/source/row_sme.cc b/source/row_sme.cc
index fca536dc4..2291562e2 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -1127,9 +1127,10 @@ __arm_locally_streaming void ARGBToUVMatrixRow_SME(
     uint8_t* dst_v,
     int width,
     const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {
-      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1],
+                           (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
+                           (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+                           (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
   ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                            uvconstants);
 }
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 7d8734921..662685882 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -223,9 +223,10 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                             uint8_t* dst_v,
                             int width,
                             const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {
-      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1],
+                           (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
+                           (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+                           (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
   ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                            uvconstants);
 }
diff --git a/source/row_win.cc b/source/row_win.cc
index 441fe1451..a7ed75199 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -8,19 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
+#include "libyuv/row.h"
 
 // This module is for Visual C 32/64 bit
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__) || \
-     defined(_M_X64) || defined(_M_X86)) && \
-    ((defined(_MSC_VER) && !defined(__clang__)) || \
+#if !defined(LIBYUV_DISABLE_X86) &&                                 \
+    (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \
+     defined(_M_X86)) &&                                            \
+    ((defined(_MSC_VER) && !defined(__clang__)) ||                  \
      defined(LIBYUV_ENABLE_ROWWIN))
 
 #include <emmintrin.h>
-#include <tmmintrin.h>  // For _mm_maddubs_epi16
 #include <immintrin.h>  // For AVX2 intrinsics
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -266,27 +266,33 @@ void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
 LIBYUV_TARGET_AVX2
 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   __m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
-  __m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
-  __m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
+  __m128i shuf_low =
+      _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
+  __m128i shuf_high =
+      _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
   __m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low);
   __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high);
 
   while (width > 0) {
     __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw);
     __m256i ymm0 = _mm256_castsi128_si256(xmm0);
-    ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
+    ymm0 = _mm256_inserti128_si256(
+        ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
 
     __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24));
     __m256i ymm1 = _mm256_castsi128_si256(xmm1);
-    ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);
+    ymm1 = _mm256_inserti128_si256(
+        ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);
 
     __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48));
     __m256i ymm2 = _mm256_castsi128_si256(xmm2);
-    ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
+    ymm2 = _mm256_inserti128_si256(
+        ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
 
     __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68));
     __m256i ymm3 = _mm256_castsi128_si256(xmm3);
-    ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);
+    ymm3 = _mm256_inserti128_si256(
+        ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);
 
     ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
     ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@@ -312,10 +318,13 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 
 #ifdef HAS_RAWTOARGBROW_AVX512BW
 LIBYUV_TARGET_AVX512BW
-void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) {
+void RGBToARGBRow_AVX512BW(const uint8_t* src_raw,
+                           uint8_t* dst_argb,
+                           const __m128i* shuffler,
+                           int width) {
   __m512i zmm_alpha = _mm512_set1_epi32(0xff000000);
-  __m512i zmm_perm = _mm512_set_epi32(
-      12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
+  __m512i zmm_perm =
+      _mm512_set_epi32(12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
   __m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler));
 
   while (width > 0) {
@@ -351,14 +360,20 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m1
 }
 
 LIBYUV_TARGET_AVX512BW
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
-  __m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
+void RAWToARGBRow_AVX512BW(const uint8_t* src_raw,
+                           uint8_t* dst_argb,
+                           int width) {
+  __m128i shuf =
+      _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
   RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width);
 }
 
 LIBYUV_TARGET_AVX512BW
-void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
-  __m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
+void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24,
+                             uint8_t* dst_argb,
+                             int width) {
+  __m128i shuf =
+      _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
   RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width);
 }
 #endif
@@ -374,16 +389,19 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
   __m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
   __m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
   __m256i ymm_0101 = _mm256_set1_epi16(0x0101);
-  __m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
-                                      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
+  __m256i ymm_shuf =
+      _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, 0,
+                       4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
   __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
   __m256i ymm_zero = _mm256_setzero_si256();
 
   while (width > 0) {
     __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
     __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
-    __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
-    __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
+    __m256i ymm2 =
+        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
+    __m256i ymm3 =
+        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
 
     ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
     ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@@ -455,8 +473,8 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
 #ifdef HAS_MIRRORROW_AVX2
 LIBYUV_TARGET_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
-  __m256i ymm_shuf =
-      _mm256_broadcastsi128_si256(_mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
+      _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
   src += width;
   while (width > 0) {
     src -= 32;
@@ -473,8 +491,8 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 #ifdef HAS_MIRRORUVROW_AVX2
 LIBYUV_TARGET_AVX2
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-  __m256i ymm_shuf =
-      _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
+      _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
   src_uv += width * 2;
   while (width > 0) {
     src_uv -= 32;
@@ -494,8 +512,8 @@ void MirrorSplitUVRow_AVX2(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  __m256i ymm_shuf =
-      _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
+      _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1));
   src_uv += width * 2;
   while (width > 0) {
     src_uv -= 32;
@@ -516,25 +534,28 @@ LIBYUV_TARGET_AVX2
 void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
                          int width) {
-  __m256i shuf0 = _mm256_setr_epi8(
-      -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2,
-      -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2);
-  __m128i shuf1 = _mm_setr_epi8(
-      13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1);
+  __m256i shuf0 =
+      _mm256_setr_epi8(-1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2, -1,
+                       12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2);
+  __m128i shuf1 =
+      _mm_setr_epi8(13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1);
 
   src_rgb24 += width * 3 - 96;
   while (width > 0) {
     __m128i v0_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 0));
     __m128i v0_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 15));
-    __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1);
+    __m256i v0 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1);
 
     __m128i v1_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 30));
     __m128i v1_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 45));
-    __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1);
+    __m256i v1 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1);
 
     __m128i v2_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 60));
     __m128i v2_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 75));
-    __m256i v2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1);
+    __m256i v2 =
+        _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1);
 
     __m128i v3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 80));
 
@@ -544,11 +565,14 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24,
     v3 = _mm_shuffle_epi8(v3, shuf1);
 
     _mm_storeu_si128((__m128i*)(dst_rgb24 + 80), _mm256_castsi256_si128(v0));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 65), _mm256_extracti128_si256(v0, 1));
+    _mm_storeu_si128((__m128i*)(dst_rgb24 + 65),
+                     _mm256_extracti128_si256(v0, 1));
     _mm_storeu_si128((__m128i*)(dst_rgb24 + 50), _mm256_castsi256_si128(v1));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 35), _mm256_extracti128_si256(v1, 1));
+    _mm_storeu_si128((__m128i*)(dst_rgb24 + 35),
+                     _mm256_extracti128_si256(v1, 1));
     _mm_storeu_si128((__m128i*)(dst_rgb24 + 20), _mm256_castsi256_si128(v2));
-    _mm_storeu_si128((__m128i*)(dst_rgb24 + 5), _mm256_extracti128_si256(v2, 1));
+    _mm_storeu_si128((__m128i*)(dst_rgb24 + 5),
+                     _mm256_extracti128_si256(v2, 1));
     _mm_storel_epi64((__m128i*)(dst_rgb24 + 0), v3);
 
     src_rgb24 -= 96;
@@ -629,7 +653,8 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
     for (i = 0; i < width; i += 16) {
       __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i));
       __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i));
-      _mm256_storeu_si256((__m256i*)(dst_ptr + i), _mm256_avg_epu16(row0, row1));
+      _mm256_storeu_si256((__m256i*)(dst_ptr + i),
+                          _mm256_avg_epu16(row0, row1));
     }
   } else {
     for (i = 0; i < width; i += 16) {
@@ -672,21 +697,23 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 #ifdef HAS_J400TOARGBROW_AVX2
 alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = {
     0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
-    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u
-};
+    4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u};
 alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = {
-    8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
-    12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
-};
+    8u,   8u,   8u,  128u, 9u,   9u,   9u,  128u, 10u,  10u, 10u,
+    128u, 11u,  11u, 11u,  128u, 12u,  12u, 12u,  128u, 13u, 13u,
+    13u,  128u, 14u, 14u,  14u,  128u, 15u, 15u,  15u,  128u};
 
 LIBYUV_TARGET_AVX2
 void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-  __m256i ymm_mask0 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
-  __m256i ymm_mask1 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
+  __m256i ymm_mask0 =
+      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
+  __m256i ymm_mask1 =
+      _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
   __m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u);
 
   while (width > 0) {
-    __m256i ymm0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
+    __m256i ymm0 =
+        _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
 
     __m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0);
     __m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1);
@@ -707,13 +734,15 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 #ifdef HAS_RGB24TOARGBROW_AVX2
 alignas(16) static const uint8_t kShuffleMaskRGB24ToARGB[2][16] = {
     {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u},
-    {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u, 128u}
-};
+    {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u,
+     128u}};
 #endif
 
 #ifdef HAS_RGB565TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) {
+void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width) {
   __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
   __m256i ymm_scale_g = _mm256_set1_epi32(0x20802080);
   __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
@@ -730,11 +759,11 @@ void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int widt
     ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
     ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
     ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2); // RB
+    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
 
     ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
     ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
-    ymm0 = _mm256_or_si256(ymm0, ymm_mask_a); // GA
+    ymm0 = _mm256_or_si256(ymm0, ymm_mask_a);  // GA
 
     ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
     ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
@@ -755,7 +784,9 @@ void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int widt
 
 #ifdef HAS_ARGB1555TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) {
+void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
+                            int width) {
   __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108);
   __m256i ymm_scale_g = _mm256_set1_epi32(0x42004200);
   __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800);
@@ -773,14 +804,14 @@ void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int
     ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb);
     ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb);
     ymm1 = _mm256_slli_epi16(ymm1, 8);
-    ymm1 = _mm256_or_si256(ymm1, ymm2); // RB
+    ymm1 = _mm256_or_si256(ymm1, ymm2);  // RB
 
     ymm2 = ymm0;
     ymm0 = _mm256_and_si256(ymm0, ymm_mask_g);
     ymm2 = _mm256_srai_epi16(ymm2, 8);
     ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g);
     ymm2 = _mm256_and_si256(ymm2, ymm_mask_a);
-    ymm0 = _mm256_or_si256(ymm0, ymm2); // GA
+    ymm0 = _mm256_or_si256(ymm0, ymm2);  // GA
 
     ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);
     ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0);
@@ -801,7 +832,9 @@ void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int
 
 #ifdef HAS_ARGB4444TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) {
+void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width) {
   __m256i ymm_mask = _mm256_set1_epi32(0x0f0f0f0f);
   __m256i ymm_mask2 = _mm256_slli_epi32(ymm_mask, 4);
 
@@ -841,27 +874,35 @@ void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int
 
 #ifdef HAS_RGB24TOARGBROW_AVX2
 LIBYUV_TARGET_AVX2
-void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
+void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
   __m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
-  __m256i ymm_shuf = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0]));
-  __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1]));
+  __m256i ymm_shuf = _mm256_broadcastsi128_si256(
+      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0]));
+  __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(
+      _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1]));
 
   while (width > 0) {
     __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_rgb24);
     __m256i ymm0 = _mm256_castsi128_si256(xmm0);
-    ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1);
+    ymm0 = _mm256_inserti128_si256(
+        ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1);
 
     __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 24));
     __m256i ymm1 = _mm256_castsi128_si256(xmm1);
-    ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1);
+    ymm1 = _mm256_inserti128_si256(
+        ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1);
 
     __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 48));
     __m256i ymm2 = _mm256_castsi128_si256(xmm2);
-    ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1);
+    ymm2 = _mm256_inserti128_si256(
+        ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1);
 
     __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 68));
     __m256i ymm3 = _mm256_castsi128_si256(xmm3);
-    ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1);
+    ymm3 = _mm256_inserti128_si256(
+        ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1);
 
     ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
     ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
@@ -886,6 +927,50 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width)
 }
 #endif
 
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+LIBYUV_TARGET_AVX2
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  __m256i control =
+      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)shuffler));
+  while (width >= 16) {
+    __m256i row = _mm256_loadu_si256((const __m256i*)src_argb);
+    __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
+    row = _mm256_shuffle_epi8(row, control);
+    row1 = _mm256_shuffle_epi8(row1, control);
+    _mm256_storeu_si256((__m256i*)dst_argb, row);
+    _mm256_storeu_si256((__m256i*)(dst_argb + 32), row1);
+    src_argb += 64;
+    dst_argb += 64;
+    width -= 16;
+  }
+}
+#endif
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX512BW
+LIBYUV_TARGET_AVX512BW
+void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const uint8_t* shuffler,
+                             int width) {
+  __m512i control =
+      _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)shuffler));
+  while (width >= 32) {
+    __m512i row = _mm512_loadu_si512((const __m512i*)src_argb);
+    __m512i row1 = _mm512_loadu_si512((const __m512i*)(src_argb + 64));
+    row = _mm512_shuffle_epi8(row, control);
+    row1 = _mm512_shuffle_epi8(row1, control);
+    _mm512_storeu_si512((__m512i*)dst_argb, row);
+    _mm512_storeu_si512((__m512i*)(dst_argb + 64), row1);
+    src_argb += 128;
+    dst_argb += 128;
+    width -= 32;
+  }
+}
+#endif
+
 #endif
 
 #ifdef __cplusplus
@@ -893,4 +978,7 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width)
 }  // namespace libyuv
 #endif
 
-#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_X86)) && ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN))
+#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) ||
+        // defined(__i386__) || defined(_M_X64) || defined(_M_X86)) &&
+        // ((defined(_MSC_VER) && !defined(__clang__)) ||
+        // defined(LIBYUV_ENABLE_ROWWIN))
diff --git a/source/scale.cc b/source/scale.cc
index 0064a0991..4b7b2d3bc 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1951,9 +1951,9 @@ int ScalePlane(const uint8_t* src,
   // Reject dimensions larger than 32768 (or smaller than -32768 for height).
   // This prevents FixedDiv signed integer overflows that can lead to division
   // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 ||
-      src_width > 32768 || src_height < -32768 || src_height > 32768 ||
-      !dst || dst_width <= 0 || dst_height <= 0) {
+  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
+      dst_height <= 0) {
     return -1;
   }
   // Simplify filtering when possible.
@@ -2059,9 +2059,9 @@ int ScalePlane_16(const uint16_t* src,
   // Reject dimensions larger than 32768 (or smaller than -32768 for height).
   // This prevents FixedDiv signed integer overflows that can lead to division
   // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 ||
-      src_width > 32768 || src_height < -32768 || src_height > 32768 ||
-      !dst || dst_width <= 0 || dst_height <= 0) {
+  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
+      dst_height <= 0) {
     return -1;
   }
   // Simplify filtering when possible.
@@ -2171,9 +2171,9 @@ int ScalePlane_12(const uint16_t* src,
   // Reject dimensions larger than 32768 (or smaller than -32768 for height).
   // This prevents FixedDiv signed integer overflows that can lead to division
   // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations.
-  if (!src || src_width <= 0 || src_height == 0 ||
-      src_width > 32768 || src_height < -32768 || src_height > 32768 ||
-      !dst || dst_width <= 0 || dst_height <= 0) {
+  if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 ||
+      dst_height <= 0) {
     return -1;
   }
   // Simplify filtering when possible.
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 7040d0add..e2447119b 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -792,10 +792,10 @@ void ScaleFilterCols64_C(uint8_t* dst_ptr,
 #undef BLENDER
 
 // Same as 8 bit arm blender but return is cast to uint16_t
-#define BLENDER(a, b, f) \
-  (uint16_t)(            \
-      (int)(a) +         \
-      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
+#define BLENDER(a, b, f)                                                      \
+  (uint16_t)((int)(a) +                                                       \
+             (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> \
+                   16))
 
 void ScaleFilterCols_16_C(uint16_t* dst_ptr,
                           const uint16_t* src_ptr,
@@ -1196,7 +1196,7 @@ void ScaleARGBColsUp2_C(uint8_t* dst_argb,
 
 // TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
 #define BLENDERC(a, b, f, s) \
   (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
 #define BLENDER(a, b, f)                                                 \
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index 5338482c5..773076669 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -1759,25 +1759,25 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                       uint16_t* dst_ptr,
                       int src_width) {
-      asm volatile("pxor        %%xmm5,%%xmm5                 \n"
+  asm volatile("pxor        %%xmm5,%%xmm5                 \n"
 
                // 16 pixel loop.
                LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm3                   \n"
-      "lea         0x10(%0),%0                   \n"  // src_ptr += 16
-      "movdqu      (%1),%%xmm0                   \n"
-      "movdqu      0x10(%1),%%xmm1               \n"
-      "movdqa      %%xmm3,%%xmm2                 \n"
-      "punpcklbw   %%xmm5,%%xmm2                 \n"
-      "punpckhbw   %%xmm5,%%xmm3                 \n"
-      "paddusw     %%xmm2,%%xmm0                 \n"
-      "paddusw     %%xmm3,%%xmm1                 \n"
-      "movdqu      %%xmm0,(%1)                   \n"
-      "movdqu      %%xmm1,0x10(%1)               \n"
-      "lea         0x20(%1),%1                   \n"
-      "sub         $0x10,%2                      \n"
-      "jg          1b                            \n"
+               "1:          \n"
+               "movdqu      (%0),%%xmm3                   \n"
+               "lea         0x10(%0),%0                   \n"  // src_ptr += 16
+               "movdqu      (%1),%%xmm0                   \n"
+               "movdqu      0x10(%1),%%xmm1               \n"
+               "movdqa      %%xmm3,%%xmm2                 \n"
+               "punpcklbw   %%xmm5,%%xmm2                 \n"
+               "punpckhbw   %%xmm5,%%xmm3                 \n"
+               "paddusw     %%xmm2,%%xmm0                 \n"
+               "paddusw     %%xmm3,%%xmm1                 \n"
+               "movdqu      %%xmm0,(%1)                   \n"
+               "movdqu      %%xmm1,0x10(%1)               \n"
+               "lea         0x20(%1),%1                   \n"
+               "sub         $0x10,%2                      \n"
+               "jg          1b                            \n"
                : "+r"(src_ptr),   // %0
                  "+r"(dst_ptr),   // %1
                  "+r"(src_width)  // %2
@@ -1790,23 +1790,23 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                       uint16_t* dst_ptr,
                       int src_width) {
-      asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
+  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
 
                LABELALIGN
-      "1:          \n"
-      "vmovdqu     (%0),%%ymm3                   \n"
-      "lea         0x20(%0),%0                   \n"  // src_ptr += 32
-      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
-      "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
-      "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
-      "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
-      "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
-      "vmovdqu     %%ymm0,(%1)                   \n"
-      "vmovdqu     %%ymm1,0x20(%1)               \n"
-      "lea         0x40(%1),%1                   \n"
-      "sub         $0x20,%2                      \n"
-      "jg          1b                            \n"
-      "vzeroupper  \n"
+               "1:          \n"
+               "vmovdqu     (%0),%%ymm3                   \n"
+               "lea         0x20(%0),%0                   \n"  // src_ptr += 32
+               "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
+               "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
+               "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
+               "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
+               "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
+               "vmovdqu     %%ymm0,(%1)                   \n"
+               "vmovdqu     %%ymm1,0x20(%1)               \n"
+               "lea         0x40(%1),%1                   \n"
+               "sub         $0x20,%2                      \n"
+               "jg          1b                            \n"
+               "vzeroupper  \n"
                : "+r"(src_ptr),   // %0
                  "+r"(dst_ptr),   // %1
                  "+r"(src_width)  // %2
diff --git a/source/scale_win.cc b/source/scale_win.cc
index 870ed77b3..4b7fd3590 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -104,7 +104,7 @@ __declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8          // isolate odd pixels.
+    psrlw      xmm0, 8       // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -138,7 +138,7 @@ __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
     lea        eax,  [eax + 32]
     pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
-    pavgw      xmm0, xmm5       // (x + 1) / 2
+    pavgw      xmm0, xmm5    // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -213,7 +213,7 @@ __declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
     vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
     vpsrlw      ymm1, ymm1, 8
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -249,7 +249,7 @@ __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
     vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -319,7 +319,7 @@ __declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
     // src_stride ignored
     mov        edx, [esp + 12]  // dst_ptr
     mov        ecx, [esp + 16]  // dst_width
-    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
+    pcmpeqb    xmm5, xmm5     // generate mask 0x00ff0000
     psrld      xmm5, 24
     pslld      xmm5, 16
 
@@ -424,7 +424,7 @@ __declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
     vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8    // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -687,7 +687,7 @@ __declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
     pshufb     xmm1, xmm5
     paddusb    xmm0, xmm1
 
-    movq       qword ptr [edx], xmm0       // write 12 pixels
+    movq       qword ptr [edx], xmm0    // write 12 pixels
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
@@ -1030,7 +1030,7 @@ __declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     shufps     xmm0, xmm1, 0x88  // even pixels
-    shufps     xmm2, xmm1, 0xdd       // odd pixels
+    shufps     xmm2, xmm1, 0xdd    // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1216,7 +1216,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
     test       ecx, 2
     je         xloop29
 
-        // 2 Pixels.
+         // 2 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
     pextrw     eax, xmm2, 5  // get x2 integer.
@@ -1229,7 +1229,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
     test       ecx, 1
     je         xloop99
 
-        // 1 Pixels.
+         // 1 Pixel.
     movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
     movd       dword ptr [edi], xmm0
  xloop99:
diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc
index 3048f728a..24456a524 100644
--- a/unit_test/color_test.cc
+++ b/unit_test/color_test.cc
@@ -464,8 +464,7 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
 static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
   double y1 = (y - 16) * 1.164384;
   *r = RoundToByte(y1 - (v - 128) * -1.67867);
-  *g = RoundToByte(y1 - (u - 128) * 0.187326 -
-                   (v - 128) * 0.65042);
+  *g = RoundToByte(y1 - (u - 128) * 0.187326 - (v - 128) * 0.65042);
   *b = RoundToByte(y1 - (u - 128) * -2.14177);
 }
 
diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc
index d7776c479..7f545a435 100644
--- a/unit_test/convert_argb_test.cc
+++ b/unit_test/convert_argb_test.cc
@@ -53,9 +53,9 @@ namespace libyuv {
 #define ABGRToABGR ARGBCopy
 
 // subsample amount uses a divide.
-#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
 
-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
 
 #define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X,              \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X,   \
@@ -82,15 +82,19 @@ namespace libyuv {
         (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1);                    \
     const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X);    \
     const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y);  \
-    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF);  \
+    align_buffer_page_end(src_y,                                               \
+                          kPaddedWidth * kPaddedHeight * SRC_BPC + OFF);       \
     align_buffer_page_end(                                                     \
-        src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                  \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);    \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);                \
-    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
-    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC);  \
+        src_uv,                                                                \
+        kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC * 2 + OFF);       \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);                \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC);  \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);              \
+    align_buffer_page_end(dst_u_opt,                                           \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
+    align_buffer_page_end(dst_v_opt,                                           \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC);           \
     SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF);                    \
     SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF);                  \
     for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) {                   \
@@ -101,12 +105,12 @@ namespace libyuv {
       src_uv_p[i] =                                                            \
           (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH)));       \
     }                                                                          \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                              \
-    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
-    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC);                \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                          \
-    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
-    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC);            \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                            \
+    memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
+    memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC);              \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                        \
+    memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
+    memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     SRC_FMT_PLANAR##To##FMT_PLANAR(                                            \
         src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2,                          \
@@ -223,11 +227,11 @@ TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1)
     const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                     \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
     align_buffer_page_end(src_u, kSizeUV + OFF);                              \
     align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);             \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);            \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
       src_y[i + OFF] = (fastrand() & 0xff);                                   \
     }                                                                         \
@@ -381,58 +385,58 @@ TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
 TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
 #endif
 
-#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,      \
-                   W1280, N, NEG, OFF)                                         \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                        \
-    const int kWidth = W1280;                                                  \
-    const int kHeight = benchmark_height_;                                     \
-    const int kStrideB = kWidth * BPP_B;                                       \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
-    align_buffer_page_end(src_uv,                                              \
-                          kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight);                      \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight);                    \
-    for (int i = 0; i < kHeight; ++i)                                          \
-      for (int j = 0; j < kWidth; ++j)                                         \
-        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                     \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                  \
-      for (int j = 0; j < kStrideUV * 2; ++j) {                                \
-        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);             \
-      }                                                                        \
-    }                                                                          \
-    memset(dst_argb_c, 1, kStrideB* kHeight);                                  \
-    memset(dst_argb_opt, 101, kStrideB* kHeight);                              \
-    MaskCpuFlags(disable_cpu_flags_);                                          \
-    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,    \
-                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);    \
-    MaskCpuFlags(benchmark_cpu_info_);                                         \
-    for (int i = 0; i < benchmark_iterations_; ++i) {                          \
-      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,  \
-                            dst_argb_opt, kWidth * BPP_B, kWidth,              \
-                            NEG kHeight);                                      \
-    }                                                                          \
-    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
-    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                 \
-    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);               \
-    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                             \
-    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                         \
-    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,      \
-                  kHeight);                                                    \
-    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth,  \
-                  kHeight);                                                    \
-    for (int i = 0; i < kHeight; ++i) {                                        \
-      for (int j = 0; j < kWidth * 4; ++j) {                                   \
-        ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j],                            \
-                  dst_argb32_opt[i * kWidth * 4 + j]);                         \
-      }                                                                        \
-    }                                                                          \
-    free_aligned_buffer_page_end(src_y);                                       \
-    free_aligned_buffer_page_end(src_uv);                                      \
-    free_aligned_buffer_page_end(dst_argb_c);                                  \
-    free_aligned_buffer_page_end(dst_argb_opt);                                \
-    free_aligned_buffer_page_end(dst_argb32_c);                                \
-    free_aligned_buffer_page_end(dst_argb32_opt);                              \
+#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
+                   W1280, N, NEG, OFF)                                        \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                       \
+    const int kWidth = W1280;                                                 \
+    const int kHeight = benchmark_height_;                                    \
+    const int kStrideB = kWidth * BPP_B;                                      \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
+    align_buffer_page_end(                                                    \
+        src_uv, kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF);         \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                    \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                  \
+    for (int i = 0; i < kHeight; ++i)                                         \
+      for (int j = 0; j < kWidth; ++j)                                        \
+        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff);                    \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) {                 \
+      for (int j = 0; j < kStrideUV * 2; ++j) {                               \
+        src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff);            \
+      }                                                                       \
+    }                                                                         \
+    memset(dst_argb_c, 1, kStrideB * kHeight);                                \
+    memset(dst_argb_opt, 101, kStrideB * kHeight);                            \
+    MaskCpuFlags(disable_cpu_flags_);                                         \
+    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2,   \
+                          dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight);   \
+    MaskCpuFlags(benchmark_cpu_info_);                                        \
+    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
+      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \
+                            dst_argb_opt, kWidth * BPP_B, kWidth,             \
+                            NEG kHeight);                                     \
+    }                                                                         \
+    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */   \
+    align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight);                \
+    align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);              \
+    memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                            \
+    memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                        \
+    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,     \
+                  kHeight);                                                   \
+    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+                  kHeight);                                                   \
+    for (int i = 0; i < kHeight; ++i) {                                       \
+      for (int j = 0; j < kWidth * 4; ++j) {                                  \
+        ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j],                           \
+                  dst_argb32_opt[i * kWidth * 4 + j]);                        \
+      }                                                                       \
+    }                                                                         \
+    free_aligned_buffer_page_end(src_y);                                      \
+    free_aligned_buffer_page_end(src_uv);                                     \
+    free_aligned_buffer_page_end(dst_argb_c);                                 \
+    free_aligned_buffer_page_end(dst_argb_opt);                               \
+    free_aligned_buffer_page_end(dst_argb32_c);                               \
+    free_aligned_buffer_page_end(dst_argb32_opt);                             \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -507,15 +511,16 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
     const int kStrideB =                                                       \
         (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                 \
     align_buffer_page_end(src_argb,                                            \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
+    align_buffer_page_end(dst_argb_c,                                          \
+                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
     align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideB* kHeightB*(int)sizeof(TYPE_B));             \
+                          kStrideB * kHeightB * (int)sizeof(TYPE_B));          \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
       src_argb[i + OFF] = (fastrand() & 0xff);                                 \
     }                                                                          \
-    memset(dst_argb_c, 1, kStrideB* kHeightB);                                 \
-    memset(dst_argb_opt, 101, kStrideB* kHeightB);                             \
+    memset(dst_argb_c, 1, kStrideB * kHeightB);                                \
+    memset(dst_argb_opt, 101, kStrideB * kHeightB);                            \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \
                      kStrideB, kWidth, NEG kHeight);                           \
@@ -532,41 +537,42 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2)
     free_aligned_buffer_page_end(dst_argb_opt);                                \
   }
 
-#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,        \
-                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                      \
-  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                       \
-    for (int times = 0; times < benchmark_iterations_; ++times) {              \
-      const int kWidth = (fastrand() & 63) + 1;                                \
-      const int kHeight = (fastrand() & 31) + 1;                               \
-      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;     \
-      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;     \
-      const int kStrideA =                                                     \
-          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
-      const int kStrideB =                                                     \
-          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-      align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
-      align_buffer_page_end(dst_argb_c,                                        \
-                            kStrideB* kHeightB*(int)sizeof(TYPE_B));           \
-      align_buffer_page_end(dst_argb_opt,                                      \
-                            kStrideB* kHeightB*(int)sizeof(TYPE_B));           \
-      for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {    \
-        src_argb[i] = 0xfe;                                                    \
-      }                                                                        \
-      memset(dst_argb_c, 123, kStrideB* kHeightB);                             \
-      memset(dst_argb_opt, 123, kStrideB* kHeightB);                           \
-      MaskCpuFlags(disable_cpu_flags_);                                        \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,       \
-                       kStrideB, kWidth, kHeight);                             \
-      MaskCpuFlags(benchmark_cpu_info_);                                       \
-      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,     \
-                       kStrideB, kWidth, kHeight);                             \
-      for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) {    \
-        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                             \
-      }                                                                        \
-      free_aligned_buffer_page_end(src_argb);                                  \
-      free_aligned_buffer_page_end(dst_argb_c);                                \
-      free_aligned_buffer_page_end(dst_argb_opt);                              \
-    }                                                                          \
+#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B,     \
+                       TYPE_B, EPP_B, STRIDE_B, HEIGHT_B)                   \
+  TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) {                    \
+    for (int times = 0; times < benchmark_iterations_; ++times) {           \
+      const int kWidth = (fastrand() & 63) + 1;                             \
+      const int kHeight = (fastrand() & 31) + 1;                            \
+      const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;  \
+      const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;  \
+      const int kStrideA =                                                  \
+          (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;            \
+      const int kStrideB =                                                  \
+          (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;            \
+      align_buffer_page_end(src_argb,                                       \
+                            kStrideA * kHeightA * (int)sizeof(TYPE_A));     \
+      align_buffer_page_end(dst_argb_c,                                     \
+                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
+      align_buffer_page_end(dst_argb_opt,                                   \
+                            kStrideB * kHeightB * (int)sizeof(TYPE_B));     \
+      for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
+        src_argb[i] = 0xfe;                                                 \
+      }                                                                     \
+      memset(dst_argb_c, 123, kStrideB * kHeightB);                         \
+      memset(dst_argb_opt, 123, kStrideB * kHeightB);                       \
+      MaskCpuFlags(disable_cpu_flags_);                                     \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c,    \
+                       kStrideB, kWidth, kHeight);                          \
+      MaskCpuFlags(benchmark_cpu_info_);                                    \
+      FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt,  \
+                       kStrideB, kWidth, kHeight);                          \
+      for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \
+        ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]);                          \
+      }                                                                     \
+      free_aligned_buffer_page_end(src_argb);                               \
+      free_aligned_buffer_page_end(dst_argb_c);                             \
+      free_aligned_buffer_page_end(dst_argb_opt);                           \
+    }                                                                       \
   }
 
 #if defined(ENABLE_FULL_TESTS)
@@ -672,11 +678,11 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
     const int kStrideB =                                                      \
         (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                \
     align_buffer_page_end(src_argb,                                           \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
     align_buffer_page_end(dst_argb_c,                                         \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
     align_buffer_page_end(dst_argb_opt,                                       \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);      \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);   \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {     \
       src_argb[i + OFF] = (fastrand() & 0xff);                                \
     }                                                                         \
@@ -791,14 +797,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
         (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
     const int kStrideB =                                                     \
         (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-    align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF);               \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
+    align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
     for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
       src_argb[i + OFF] = (fastrand() & 0xff);                               \
     }                                                                        \
-    memset(dst_argb_c, 1, kStrideB* kHeightB);                               \
-    memset(dst_argb_opt, 101, kStrideB* kHeightB);                           \
+    memset(dst_argb_c, 1, kStrideB * kHeightB);                              \
+    memset(dst_argb_opt, 101, kStrideB * kHeightB);                          \
     MaskCpuFlags(disable_cpu_flags_);                                        \
     FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \
                              NULL, kWidth, NEG kHeight);                     \
@@ -827,14 +833,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1)
           (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;               \
       const int kStrideB =                                                     \
           (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;               \
-      align_buffer_page_end(src_argb, kStrideA* kHeightA);                     \
-      align_buffer_page_end(dst_argb_c, kStrideB* kHeightB);                   \
-      align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB);                 \
+      align_buffer_page_end(src_argb, kStrideA * kHeightA);                    \
+      align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                  \
+      align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                \
       for (int i = 0; i < kStrideA * kHeightA; ++i) {                          \
         src_argb[i] = (fastrand() & 0xff);                                     \
       }                                                                        \
-      memset(dst_argb_c, 123, kStrideB* kHeightB);                             \
-      memset(dst_argb_opt, 123, kStrideB* kHeightB);                           \
+      memset(dst_argb_c, 123, kStrideB * kHeightB);                            \
+      memset(dst_argb_opt, 123, kStrideB * kHeightB);                          \
       MaskCpuFlags(disable_cpu_flags_);                                        \
       FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \
                                kWidth, kHeight);                               \
@@ -885,15 +891,16 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
     const int kStrideA =                                                       \
         (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;                 \
     align_buffer_page_end(src_argb,                                            \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
-    align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
+    align_buffer_page_end(dst_argb_c,                                          \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
     align_buffer_page_end(dst_argb_opt,                                        \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A));             \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A));          \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
       src_argb[i + OFF] = (fastrand() & 0xff);                                 \
     }                                                                          \
-    memset(dst_argb_c, 1, kStrideA* kHeightA);                                 \
-    memset(dst_argb_opt, 101, kStrideA* kHeightA);                             \
+    memset(dst_argb_c, 1, kStrideA * kHeightA);                                \
+    memset(dst_argb_opt, 101, kStrideA * kHeightA);                            \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c,         \
              kStrideA, kWidth, NEG kHeight);                                   \
@@ -945,12 +952,12 @@ TESTEND(AB64ToAR64, uint16_t, 4, 4, 1)
     const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
     align_buffer_page_end(src_u, kSizeUV + OFF);                               \
     align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
+    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       src_y[i + OFF] = (fastrand() & 0xff);                                    \
       src_a[i + OFF] = (fastrand() & 0xff);                                    \
@@ -1240,11 +1247,11 @@ TEST_F(LibYUVConvertTest, TestDither) {
     const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                      \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
     align_buffer_page_end(src_u, kSizeUV + OFF);                               \
     align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       src_y[i + OFF] = (fastrand() & 0xff);                                    \
     }                                                                          \
@@ -1265,10 +1272,10 @@ TEST_F(LibYUVConvertTest, TestDither) {
           dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight);            \
     }                                                                          \
     /* Convert to ARGB so 565 is expanded to bytes that can be compared. */    \
-    align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight);               \
-    align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight);             \
-    memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight);                           \
-    memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight);                       \
+    align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight);             \
+    align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight);           \
+    memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight);                         \
+    memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight);                     \
     FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
                      kWidth, kHeight);                                         \
     FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt,             \
@@ -1317,10 +1324,10 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
     const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                    \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
     align_buffer_page_end(src_u, kSizeUV + OFF);                              \
     align_buffer_page_end(src_v, kSizeUV + OFF);                              \
-    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);              \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
       src_y[i + OFF] = (fastrand() & 0xff);                                   \
     }                                                                         \
@@ -1334,8 +1341,8 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
                           kWidth, NEG kHeight);                               \
     /* Convert to a 3rd format in 1 step and 2 steps and compare  */          \
     const int kStrideC = kWidth * BPP_C;                                      \
-    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);             \
     memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                          \
     memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                         \
     for (int i = 0; i < benchmark_iterations_; ++i) {                         \
@@ -1464,14 +1471,14 @@ TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
     const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
     const int kSizeUV =                                                        \
         SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y);          \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                       \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                      \
     align_buffer_page_end(src_u, kSizeUV + OFF);                               \
     align_buffer_page_end(src_v, kSizeUV + OFF);                               \
-    align_buffer_page_end(src_a, kWidth* kHeight + OFF);                       \
-    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(src_a, kWidth * kHeight + OFF);                      \
+    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
     const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
-    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
     memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
     memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
     memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
@@ -1578,16 +1585,16 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
     const int kHeight = benchmark_height_;                                     \
     const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                     \
     const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B;                     \
-    align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF);                \
-    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF);                \
+    align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF);               \
     MemRandomize(src_argb_a + OFF, kStrideA * kHeight);                        \
     memset(dst_argb_b + OFF, 1, kStrideB * kHeight);                           \
     FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB,   \
                      kWidth, NEG kHeight);                                     \
     /* Convert to a 3rd format in 1 step and 2 steps and compare  */           \
     const int kStrideC = kWidth * BPP_C;                                       \
-    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF);                \
-    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF);              \
     memset(dst_argb_c + OFF, 2, kStrideC * kHeight);                           \
     memset(dst_argb_bc + OFF, 3, kStrideC * kHeight);                          \
     for (int i = 0; i < benchmark_iterations_; ++i) {                          \
@@ -1798,11 +1805,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
     const int kBpc = 2;                                                       \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);               \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF);                       \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF);                       \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);              \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);            \
+    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);             \
+    align_buffer_page_end(src_u, kSizeUV * kBpc + SOFF);                      \
+    align_buffer_page_end(src_v, kSizeUV * kBpc + SOFF);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);             \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);           \
     for (int i = 0; i < kWidth * kHeight; ++i) {                              \
       reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
     }                                                                         \
@@ -1913,12 +1920,12 @@ TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1)
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
     const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF);                 \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF);                         \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF);                         \
-    align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF);                 \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);                \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);              \
+    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + OFF);               \
+    align_buffer_page_end(src_u, kSizeUV * kBpc + OFF);                        \
+    align_buffer_page_end(src_v, kSizeUV * kBpc + OFF);                        \
+    align_buffer_page_end(src_a, kWidth * kHeight * kBpc + OFF);               \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF);               \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF);             \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       reinterpret_cast<uint16_t*>(src_y + OFF)[i] =                            \
           (fastrand() & ((1 << S_DEPTH) - 1));                                 \
@@ -2146,10 +2153,10 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;         \
     const int kBpc = 2;                                                        \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);                \
-    align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF);                       \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);             \
+    align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF);              \
+    align_buffer_page_end(src_uv, kSizeUV * kBpc + SOFF);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF);            \
     for (int i = 0; i < kWidth * kHeight; ++i) {                               \
       reinterpret_cast<uint16_t*>(src_y + SOFF)[i] =                           \
           (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH)));                 \
@@ -2831,16 +2838,23 @@ TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
         memset(dest_v_c, 0, sizeof(dest_v_c));
         memset(dest_u_opt, 0, sizeof(dest_u_opt));
         memset(dest_v_opt, 0, sizeof(dest_v_opt));
-        
+
         int src_stride = (height == 1) ? 0 : kMaxWidth * 4;
 
-        ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0], &dest_v_c[0], width, &kArgbI601Constants);
-        ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride, &dest_u_opt[0], &dest_v_opt[0], width, &kArgbI601Constants);
+        ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0],
+                            &dest_v_c[0], width, &kArgbI601Constants);
+        ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride,
+                                   &dest_u_opt[0], &dest_v_opt[0], width,
+                                   &kArgbI601Constants);
 
         int half_width = (width + 1) / 2;
         for (int i = 0; i < half_width; ++i) {
-          ASSERT_EQ(dest_u_c[i], dest_u_opt[i]) << "u mismatch at " << i << " width " << width << " height " << height;
-          ASSERT_EQ(dest_v_c[i], dest_v_opt[i]) << "v mismatch at " << i << " width " << width << " height " << height;
+          ASSERT_EQ(dest_u_c[i], dest_u_opt[i])
+              << "u mismatch at " << i << " width " << width << " height "
+              << height;
+          ASSERT_EQ(dest_v_c[i], dest_v_opt[i])
+              << "v mismatch at " << i << " width " << width << " height "
+              << height;
         }
       }
     }
@@ -2903,13 +2917,12 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
   free_aligned_buffer_page_end(dest_argb);
   free_aligned_buffer_page_end(orig_i400);
 }
-#endif // DISABLE_SLOW_TESTS
+#endif  // DISABLE_SLOW_TESTS
 #endif  // !defined(DISABLE_SLOW_TESTS) && \
         // (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))
 
 #endif  // !defined(LEAN_TESTS)
 
-
 #define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
                    SUBSAMP_Y, W1280, N, NEG, OFF)                              \
   TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) {                             \
@@ -2922,17 +2935,17 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2;                    \
     const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);             \
     align_buffer_page_end(src_argb,                                            \
-                          kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF);       \
-    align_buffer_page_end(dst_y_c, kStrideY* kHeight);                         \
+                          kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF);    \
+    align_buffer_page_end(dst_y_c, kStrideY * kHeight);                        \
     align_buffer_page_end(dst_uv_c, kSizeUV);                                  \
-    align_buffer_page_end(dst_y_opt, kStrideY* kHeight);                       \
+    align_buffer_page_end(dst_y_opt, kStrideY * kHeight);                      \
     align_buffer_page_end(dst_uv_opt, kSizeUV);                                \
     for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) {      \
       src_argb[i + OFF] = (fastrand() & 0xff);                                 \
     }                                                                          \
-    memset(dst_y_c, 1, kStrideY* kHeight);                                     \
+    memset(dst_y_c, 1, kStrideY * kHeight);                                    \
     memset(dst_uv_c, 2, kSizeUV);                                              \
-    memset(dst_y_opt, 101, kStrideY* kHeight);                                 \
+    memset(dst_y_opt, 101, kStrideY * kHeight);                                \
     memset(dst_uv_opt, 102, kSizeUV);                                          \
     MaskCpuFlags(disable_cpu_flags_);                                          \
     FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY,   \
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 9b7cc85d9..a38e7fdf9 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -51,9 +51,9 @@ namespace libyuv {
 #define ABGRToABGR ARGBCopy
 
 // subsample amount uses a divide.
-#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
 
-#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
 
 // Planar test
 
@@ -78,17 +78,19 @@ namespace libyuv {
     const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y);             \
     const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X);               \
     const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF);             \
+    align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF);           \
     align_buffer_page_end(src_u,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
     align_buffer_page_end(src_v,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);   \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC);   \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
-    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
+    align_buffer_page_end(dst_u_opt,                                          \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
+    align_buffer_page_end(dst_v_opt,                                          \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC);          \
     MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC);                    \
     MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
     MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
@@ -102,12 +104,12 @@ namespace libyuv {
       src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1);                       \
       src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1);                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
-    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC);               \
-    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC);               \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
-    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC);           \
-    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC);           \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
+    memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC);             \
+    memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC);             \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
+    memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC);         \
+    memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC);         \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     SRC_FMT_PLANAR##To##FMT_PLANAR(                                           \
         src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth,      \
@@ -212,15 +214,15 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
     const int kHeight = benchmark_height_;                                    \
     const int kSizeUV =                                                       \
         SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
     align_buffer_page_end(src_uv,                                             \
-                          kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);       \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
+                          kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);     \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
     align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
     align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
     align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
                                          SUBSAMPLE(kHeight, SUBSAMP_Y));      \
     align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
@@ -239,12 +241,12 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
             (fastrand() & 0xff);                                              \
       }                                                                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight);                                      \
+    memset(dst_y_c, 1, kWidth * kHeight);                                     \
     memset(dst_u_c, 2,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_c, 3,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
     memset(dst_u_opt, 102,                                                    \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_opt, 103,                                                    \
@@ -359,17 +361,17 @@ static int I400ToNV21(const uint8_t* src_y,
     const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y);             \
     const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X);               \
     const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y);             \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF);             \
+    align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF);           \
     align_buffer_page_end(src_u,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
     align_buffer_page_end(src_v,                                              \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF);      \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
+                          kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF);    \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
     align_buffer_page_end(dst_uv_c,                                           \
-                          kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);        \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);      \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
     align_buffer_page_end(dst_uv_opt,                                         \
-                          kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);        \
+                          kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);      \
     MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC);                    \
     MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
     MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC);      \
@@ -383,10 +385,10 @@ static int I400ToNV21(const uint8_t* src_y,
       src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1);                       \
       src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1);                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
-    memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);          \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
-    memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2);      \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
+    memset(dst_uv_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);        \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
+    memset(dst_uv_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2);    \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth,   \
                                    src_v_p, kSrcHalfWidth,                    \
@@ -478,14 +480,15 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
         (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1);                   \
     const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X);   \
     const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+    align_buffer_page_end(src_y,                                              \
+                          kPaddedWidth * kPaddedHeight * SRC_BPC + OFF);      \
     align_buffer_page_end(                                                    \
         src_uv,                                                               \
         2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF);      \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC);                 \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC);               \
     align_buffer_page_end(dst_uv_c,                                           \
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);      \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC);               \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC);             \
     align_buffer_page_end(dst_uv_opt,                                         \
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);      \
     SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF);                   \
@@ -502,13 +505,13 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
       src_uv_p[i] =                                                           \
           (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH)));      \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC);                             \
+    memset(dst_y_c, 1, kWidth * kHeight * DST_BPC);                           \
     memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);        \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC);                         \
+    memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC);                       \
     memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC);    \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     SRC_FMT_PLANAR##To##FMT_PLANAR(                                           \
-        src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p,              \
+        src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p,             \
         2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T),                     \
         DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth,               \
         reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, kWidth,        \
@@ -516,7 +519,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
     MaskCpuFlags(benchmark_cpu_info_);                                        \
     for (int i = 0; i < benchmark_iterations_; ++i) {                         \
       SRC_FMT_PLANAR##To##FMT_PLANAR(                                         \
-          src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p,            \
+          src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p,           \
           2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T),                   \
           DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth,           \
           reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, kWidth,    \
@@ -598,16 +601,16 @@ TESTBPTOBP(P010, uint16_t, 2, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
     const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                   \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8;           \
-    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                   \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                           \
+    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                  \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                          \
     align_buffer_page_end(dst_uv_c,                                            \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                         \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                        \
     align_buffer_page_end(dst_uv_opt,                                          \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    memset(dst_y_c, 1, kWidth* kHeight);                                       \
+    memset(dst_y_c, 1, kWidth * kHeight);                                      \
     memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                   \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                  \
     memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));    \
     for (int i = 0; i < kHeight; ++i)                                          \
       for (int j = 0; j < kStride; ++j)                                        \
@@ -691,20 +694,20 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1)
     const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                   \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                        \
     const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8;           \
-    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                   \
-    align_buffer_page_end(dst_a_c, kWidth* kHeight);                           \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                           \
+    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                  \
+    align_buffer_page_end(dst_a_c, kWidth * kHeight);                          \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                          \
     align_buffer_page_end(dst_uv_c,                                            \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    align_buffer_page_end(dst_a_opt, kWidth* kHeight);                         \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                         \
+    align_buffer_page_end(dst_a_opt, kWidth * kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                        \
     align_buffer_page_end(dst_uv_opt,                                          \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));      \
-    memset(dst_a_c, 1, kWidth* kHeight);                                       \
-    memset(dst_y_c, 2, kWidth* kHeight);                                       \
+    memset(dst_a_c, 1, kWidth * kHeight);                                      \
+    memset(dst_y_c, 2, kWidth * kHeight);                                      \
     memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    memset(dst_a_opt, 101, kWidth* kHeight);                                   \
-    memset(dst_y_opt, 102, kWidth* kHeight);                                   \
+    memset(dst_a_opt, 101, kWidth * kHeight);                                  \
+    memset(dst_y_opt, 102, kWidth * kHeight);                                  \
     memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));    \
     for (int i = 0; i < kHeight; ++i)                                          \
       for (int j = 0; j < kStride; ++j)                                        \
@@ -765,19 +768,19 @@ TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2)
     const int kHeight = benchmark_height_;                                    \
     const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A;                     \
     const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
-    align_buffer_page_end(src_argb, kStride* kHeight + OFF);                  \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
+    align_buffer_page_end(src_argb, kStride * kHeight + OFF);                 \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
     align_buffer_page_end(dst_uv_c,                                           \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
     align_buffer_page_end(dst_uv_opt,                                         \
                           kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     for (int i = 0; i < kHeight; ++i)                                         \
       for (int j = 0; j < kStride; ++j)                                       \
         src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff);              \
-    memset(dst_y_c, 1, kWidth* kHeight);                                      \
+    memset(dst_y_c, 1, kWidth * kHeight);                                     \
     memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));       \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
     memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y));   \
     MaskCpuFlags(disable_cpu_flags_);                                         \
     FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \
@@ -1950,17 +1953,17 @@ TEST_F(LibYUVConvertTest, I420CropOddY) {
     const int kHeight = benchmark_height_;                                    \
                                                                               \
     align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight);     \
-    align_buffer_page_end(orig_y, kWidth* kHeight);                           \
+    align_buffer_page_end(orig_y, kWidth * kHeight);                          \
     align_buffer_page_end(orig_u,                                             \
                           SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));      \
     align_buffer_page_end(orig_v,                                             \
                           SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));      \
                                                                               \
-    align_buffer_page_end(dst_y_orig, kWidth* kHeight);                       \
+    align_buffer_page_end(dst_y_orig, kWidth * kHeight);                      \
     align_buffer_page_end(dst_uv_orig,                                        \
                           2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));  \
                                                                               \
-    align_buffer_page_end(dst_y, kWidth* kHeight);                            \
+    align_buffer_page_end(dst_y, kWidth * kHeight);                           \
     align_buffer_page_end(dst_uv,                                             \
                           2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2));  \
                                                                               \
@@ -2287,12 +2290,13 @@ TEST_F(LibYUVConvertTest, TestARGBToI420Matrix) {
                    dst_v, kWidth / 2, &kArgbU2020Constants, kWidth, kHeight);
 
   // Reference BT.709 (limited range)
-  // Y = round(0.2126 * 219 / 255 * R + 0.7152 * 219 / 255 * G + 0.0722 * 219 / 255 * B + 16)
-  // Y = round(0.1826 * R + 0.6142 * G + 0.0620 * B + 16)
-  // 47 * 255 + 157 * 255 + 16 * 255 + 4224 = 11985 + 40035 + 4080 + 4224 = 60324
-  // 60324 / 256 = 235.64 -> 235. Correct.
+  // Y = round((0.2126 * R + 0.7152 * G + 0.0722 * B) * 219 / 255 + 16)
+  // Y = round(0.1826 * R + 0.6142 * G + 0.0620 * B + 16)
+  // (47 + 157 + 16) * 255 + 4224 = 11985 + 40035 + 4080 + 4224 = 60324
+  // 60324 / 256 = 235.64 -> 235. Correct.
 
-  for (int i = 0; i < kWidth * kHeight * 4; ++i) src_argb[i] = 255;
+  for (int i = 0; i < kWidth * kHeight * 4; ++i)
+    src_argb[i] = 255;
   ARGBToI420Matrix(src_argb, kWidth * 4, dst_y, kWidth, dst_u, kWidth / 2,
                    dst_v, kWidth / 2, &kArgbH709Constants, kWidth, kHeight);
   ASSERT_EQ(dst_y[0], 235);
@@ -2423,6 +2427,121 @@ TEST_F(LibYUVConvertTest, TestARGBToI444Matrix) {
   free_aligned_buffer_page_end(ref_v);
 }
 
+// Checks that a direct RGB-family to I420 conversion agrees byte-for-byte
+// with the two-step path: convert_to_argb() followed by ARGBToI420().
+//
+// convert_to_yuv:     direct converter under test (e.g. BGRAToI420).
+// convert_to_argb:    converter from the same source format to ARGB.
+// width, height:      image size in pixels; chroma dimensions round up.
+// disable_cpu_flags:  MaskCpuFlags() mask for the "C_Version" pass.
+// benchmark_cpu_info: MaskCpuFlags() mask for the "SIMD_Version" pass.
+//
+// A failing ASSERT returns early, before the buffers are released.
+template <typename ConvertToYUV, typename ConvertToARGB>
+static void TestRGBToI420(ConvertToYUV convert_to_yuv,
+                          ConvertToARGB convert_to_argb,
+                          int width,
+                          int height,
+                          int disable_cpu_flags,
+                          int benchmark_cpu_info) {
+  // Round each chroma dimension up separately. The unparenthesized form
+  // (width + 1) / 2 * (height + 1) / 2 halves the whole product instead and
+  // overstates the plane size (68 rather than 64 for 16x16).
+  const int half_width = (width + 1) / 2;
+  const int half_height = (height + 1) / 2;
+  const int y_size = width * height;
+  const int uv_size = half_width * half_height;
+  const int rgb_size = width * height * 4;
+
+  align_buffer_page_end(src_rgb, rgb_size);
+  align_buffer_page_end(dst_y, y_size);
+  align_buffer_page_end(dst_u, uv_size);
+  align_buffer_page_end(dst_v, uv_size);
+
+  align_buffer_page_end(tmp_argb, rgb_size);
+  align_buffer_page_end(ref_y, y_size);
+  align_buffer_page_end(ref_u, uv_size);
+  align_buffer_page_end(ref_v, uv_size);
+
+  MemRandomize(src_rgb, rgb_size);
+
+  // Run the identical comparison twice, once per CPU flag mask.
+  const int cpu_flags[2] = {disable_cpu_flags, benchmark_cpu_info};
+  const char* const trace_names[2] = {"C_Version", "SIMD_Version"};
+
+  for (int pass = 0; pass < 2; ++pass) {
+    SCOPED_TRACE(trace_names[pass]);
+    MaskCpuFlags(cpu_flags[pass]);
+
+    // Clear buffers
+    memset(dst_y, 0, y_size);
+    memset(dst_u, 0, uv_size);
+    memset(dst_v, 0, uv_size);
+    memset(ref_y, 0, y_size);
+    memset(ref_u, 0, uv_size);
+    memset(ref_v, 0, uv_size);
+    memset(tmp_argb, 0, rgb_size);
+
+    // Direct path under test.
+    int r1 = convert_to_yuv(src_rgb, width * 4, dst_y, width, dst_u,
+                            half_width, dst_v, half_width, width, height);
+    ASSERT_EQ(r1, 0);
+
+    // Reference path: source format -> ARGB -> I420.
+    int r2 =
+        convert_to_argb(src_rgb, width * 4, tmp_argb, width * 4, width, height);
+    ASSERT_EQ(r2, 0);
+
+    int r3 = ARGBToI420(tmp_argb, width * 4, ref_y, width, ref_u, half_width,
+                        ref_v, half_width, width, height);
+    ASSERT_EQ(r3, 0);
+
+    for (int i = 0; i < y_size; ++i) {
+      ASSERT_EQ(dst_y[i], ref_y[i]);
+    }
+    for (int i = 0; i < uv_size; ++i) {
+      ASSERT_EQ(dst_u[i], ref_u[i]);
+      ASSERT_EQ(dst_v[i], ref_v[i]);
+    }
+  }
+
+  free_aligned_buffer_page_end(src_rgb);
+  free_aligned_buffer_page_end(dst_y);
+  free_aligned_buffer_page_end(dst_u);
+  free_aligned_buffer_page_end(dst_v);
+  free_aligned_buffer_page_end(tmp_argb);
+  free_aligned_buffer_page_end(ref_y);
+  free_aligned_buffer_page_end(ref_u);
+  free_aligned_buffer_page_end(ref_v);
+}
+
+TEST_F(LibYUVConvertTest, BGRAToI420_Check) {
+  TestRGBToI420(BGRAToI420, BGRAToARGB, 16, 16, disable_cpu_flags_,
+                benchmark_cpu_info_);
+  TestRGBToI420(BGRAToI420, BGRAToARGB, 17, 17, disable_cpu_flags_,
+                benchmark_cpu_info_);
+  TestRGBToI420(BGRAToI420, BGRAToARGB, 1280, 720, disable_cpu_flags_,
+                benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVConvertTest, RGBAToI420_Check) {
+  TestRGBToI420(RGBAToI420, RGBAToARGB, 16, 16, disable_cpu_flags_,
+                benchmark_cpu_info_);
+  TestRGBToI420(RGBAToI420, RGBAToARGB, 17, 17, disable_cpu_flags_,
+                benchmark_cpu_info_);
+  TestRGBToI420(RGBAToI420, RGBAToARGB, 1280, 720, disable_cpu_flags_,
+                benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVConvertTest, ABGRToI420_Check) {
+  TestRGBToI420(ABGRToI420, ABGRToARGB, 16, 16, disable_cpu_flags_,
+                benchmark_cpu_info_);
+  TestRGBToI420(ABGRToI420, ABGRToARGB, 17, 17, disable_cpu_flags_,
+                benchmark_cpu_info_);
+  TestRGBToI420(ABGRToI420, ABGRToARGB, 1280, 720, disable_cpu_flags_,
+                benchmark_cpu_info_);
+}
+
 #endif  // !defined(LEAN_TESTS)
 
 }  // namespace libyuv
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index d37001f1b..7eba494b7 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -1212,10 +1212,10 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) {
         (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;                \
     const int kStrideB =                                                      \
         (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;                \
-    align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF);               \
-    align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight);                     \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight);                   \
+    align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF);              \
+    align_buffer_page_end(src_argb_b, kStrideA * kHeight + OFF);              \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                    \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                  \
     for (int i = 0; i < kStrideA * kHeight; ++i) {                            \
       src_argb_a[i + OFF] = (fastrand() & 0xff);                              \
       src_argb_b[i + OFF] = (fastrand() & 0xff);                              \
@@ -1418,7 +1418,7 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Invert) {
                  disable_cpu_flags_, benchmark_cpu_info_, -1, 1);
 }
 
-#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
 
 static void TestI420Blend(int width,
                           int height,
diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc
index 9256f8de0..10ee64cbc 100644
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
@@ -20,7 +20,7 @@
 
 namespace libyuv {
 
-#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
 
 static void I420TestRotate(int src_width,
                            int src_height,
@@ -495,15 +495,15 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
     const int kHeight = benchmark_height_;                                    \
     const int kSizeUV =                                                       \
         SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF);                      \
+    align_buffer_page_end(src_y, kWidth * kHeight + OFF);                     \
     align_buffer_page_end(src_uv,                                             \
-                          kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);       \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight);                          \
+                          kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF);     \
+    align_buffer_page_end(dst_y_c, kWidth * kHeight);                         \
     align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
     align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) *             \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y));        \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight);                        \
+    align_buffer_page_end(dst_y_opt, kWidth * kHeight);                       \
     align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
                                          SUBSAMPLE(kHeight, SUBSAMP_Y));      \
     align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) *           \
@@ -522,12 +522,12 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) {
             (fastrand() & 0xff);                                              \
       }                                                                       \
     }                                                                         \
-    memset(dst_y_c, 1, kWidth* kHeight);                                      \
+    memset(dst_y_c, 1, kWidth * kHeight);                                     \
     memset(dst_u_c, 2,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_c, 3,                                                        \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
-    memset(dst_y_opt, 101, kWidth* kHeight);                                  \
+    memset(dst_y_opt, 101, kWidth * kHeight);                                 \
     memset(dst_u_opt, 102,                                                    \
            SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y));     \
     memset(dst_v_opt, 103,                                                    \
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 219e196dd..3d3e36fc5 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -431,13 +431,13 @@ static void FillRamp(uint8_t* buf,
 
 // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
 static void YUVToARGBTestFilter(int src_width,
-                               int src_height,
-                               int dst_width,
-                               int dst_height,
-                               FilterMode f,
-                               int benchmark_iterations,
-                               int error_threshold,
-                               int* max_diff_out) {
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                FilterMode f,
+                                int benchmark_iterations,
+                                int error_threshold,
+                                int* max_diff_out) {
   int64_t src_y_plane_size = Abs(src_width) * Abs(src_height);
   int64_t src_uv_plane_size =
       ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
@@ -448,8 +448,8 @@ static void YUVToARGBTestFilter(int src_width,
   align_buffer_page_end(src_u, src_uv_plane_size);
   align_buffer_page_end(src_v, src_uv_plane_size);
 
-  int64_t dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
-  int dst_stride_argb = (dst_width)*4;
+  int64_t dst_argb_plane_size = (dst_width) * (dst_height) * 4LL;
+  int dst_stride_argb = (dst_width) * 4;
   align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
   align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
   if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) {
@@ -516,10 +516,10 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) {
 
 TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) {
   int diff = 0;
-  YUVToARGBTestFilter(
-      benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_,
-      benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_, 10,
-      &diff);
+  YUVToARGBTestFilter(benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2,
+                      benchmark_width_, benchmark_height_,
+                      libyuv::kFilterBilinear, benchmark_iterations_, 10,
+                      &diff);
   ASSERT_LE(diff, 10);
 }
 
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 750e340fa..323094f3f 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -757,7 +757,7 @@ static int NV12TestFilter(int src_width,
   int src_height_uv = (Abs(src_height) + 1) >> 1;
 
   int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
-  int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv)*2;
+  int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2;
 
   int src_stride_y = Abs(src_width);
   int src_stride_uv = src_width_uv * 2;
@@ -775,7 +775,7 @@ static int NV12TestFilter(int src_width,
   int dst_height_uv = (dst_height + 1) >> 1;
 
   int64_t dst_y_plane_size = (dst_width) * (dst_height);
-  int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv)*2;
+  int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2;
 
   int dst_stride_y = dst_width;
   int dst_stride_uv = dst_width_uv * 2;
diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h
index 2c11c983f..e9a55c62f 100644
--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@@ -85,10 +85,11 @@ static inline bool SizeValid(int src_width,
 #define align_buffer_page_end_16(var, size)                                 \
   uint16_t* var = NULL;                                                     \
   uint8_t* var##_mem =                                                      \
-      reinterpret_cast<uint8_t*>(malloc(((size)*2 + 4095 + 63) & ~4095));   \
+      reinterpret_cast<uint8_t*>(malloc(((size) * 2 + 4095 + 63) & ~4095)); \
   if (var##_mem)                                                            \
   var = reinterpret_cast<uint16_t*>(                                        \
-      (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \
+      (intptr_t)(var##_mem + (((size) * 2 + 4095 + 63) & ~4095) -           \
+                 (size) * 2) &                                              \
       ~63)
 
 #define free_aligned_buffer_page_end_16(var) \
diff --git a/util/ssim.cc b/util/ssim.cc
index 096fbcf06..f8b4509f8 100644
--- a/util/ssim.cc
+++ b/util/ssim.cc
@@ -244,23 +244,23 @@ double GetSSIMFullKernel(const uint8_t* org,
 
 // Read 8 pixels at line #L, and convert to 16bit, perform weighting
 // and acccumulate.
-#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
-  do {                                                                       \
-    const __m128i v0 =                                                       \
-        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
-    const __m128i v1 =                                                       \
-        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
-    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
-    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
-    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
-    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
-    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
-    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
-    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
-    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
-    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
-    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
-    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
+#define LOAD_LINE_PAIR(L, WEIGHT)                                              \
+  do {                                                                         \
+    const __m128i v0 =                                                         \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
+    const __m128i v1 =                                                         \
+        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
+    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
+    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
+    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
+    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
+    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
+    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
+    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
+    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
+    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
+    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
+    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
   } while (0)
 
 #define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \