From f2ac6db694d1e5b0af1d7b05dc431e0e455fe228 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Wed, 29 Apr 2026 12:54:49 -0700
Subject: [PATCH] RAWToNV21 using SME, SVE, I8MM or Neon

Pixel 9
Now SVE2 2 pass
LibYUVConvertTest.RAWToNV21_Opt (364 ms)
  31.76% libyuv::ARGBToUVMatrixRow_SVE_SC()
  30.38% RAWToARGBRow_SVE2
  26.81% ARGBToYMatrixRow_NEON_DotProd
   3.26% MergeUVRow_NEON

Was NEON 1 pass
LibYUVConvertTest.RAWToJNV21_Opt (295 ms)
  44.14% RAWToYJRow_NEON
  41.91% RAWToUVJRow_NEON
   5.11% MergeUVRow_NEON

Clang on Intel Skylake
clang
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (301 ms)
visual c (row_win)
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (2056 ms)
clang
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (275 ms)
visual c
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (365 ms)

Bug: libyuv:42280902
Change-Id: Iaba558ebe96ce6b9881ee9335ba72b8aac390cde
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7802432
Commit-Queue: Frank Barchard
Reviewed-by: richard winterton
Reviewed-by: Dale Curtis
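For orientation, the "2 pass" flow profiled above first converts each pair of
RAW rows to ARGB, then runs the Y and UV matrix kernels over the intermediate
rows, merging U and V last. A scalar sketch of one row pair (illustrative
only: the matrix-constant arguments, prefetching and tail handling are
omitted, and the helper shapes merely approximate the row functions named
above):

  // Sketch: two-pass RAWToNV21 inner loop for one pair of rows.
  void RawToNV21TwoRows(const uint8_t* raw, int raw_stride,
                        uint8_t* dst_y, int y_stride, uint8_t* dst_vu,
                        uint8_t* argb,           // scratch: 2 rows * width * 4
                        uint8_t* u, uint8_t* v,  // scratch: (width + 1) / 2
                        int width) {
    RAWToARGBRow(raw, argb, width);                           // pass 1, row 0
    RAWToARGBRow(raw + raw_stride, argb + width * 4, width);  // pass 1, row 1
    ARGBToYMatrixRow(argb, dst_y, width);                     // pass 2: Y
    ARGBToYMatrixRow(argb + width * 4, dst_y + y_stride, width);
    ARGBToUVMatrixRow(argb, width * 4, u, v, width);  // 2x2-averaged U and V
    MergeUVRow(v, u, dst_vu, (width + 1) / 2);        // NV21 interleaves V,U
  }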
---
 README.chromium             |  2 +-
 include/libyuv/row.h        | 49 +++++++++++++++++++-
 include/libyuv/version.h    |  2 +-
 source/convert.cc           | 26 +++++++++++
 source/convert_from_argb.cc | 85 ++++++++++++++++++++++++++++++++---
 source/row_any.cc           |  6 +++
 source/row_neon.cc          | 66 +++++++++++++++++++++++++++
 source/row_neon64.cc        | 82 +++++++++++++++++++++++++++-------
 source/row_sme.cc           | 14 ++++++
 source/row_sve.cc           | 13 ++++++
 source/row_win.cc           | 89 +++++++++++++++++++++++++++++++++++++
 11 files changed, 407 insertions(+), 27 deletions(-)

diff --git a/README.chromium b/README.chromium
index 27d03c752..1407f963e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1933
+Version: 1934
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 3b2c52aaa..40272cf5a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -140,6 +140,13 @@ extern "C" {
 
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
+#if !defined(LIBYUV_DISABLE_X86) &&              \
+    (defined(__x86_64__) || defined(__i386__) || \
+     defined(_M_X64) || defined(_M_IX86))
+#define HAS_ARGBTOUVMATRIXROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#endif
+
 #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
      defined(GCC_HAS_AVX2))
@@ -163,7 +170,6 @@ extern "C" {
 #define HAS_I444TORGB24ROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
 #define HAS_J422TOARGBROW_AVX2
-#define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_NV12TOARGBROW_AVX2
 #define HAS_NV12TORGB24ROW_AVX2
@@ -427,6 +433,7 @@ extern "C" {
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVJ444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVMATRIXROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #if !defined(__aarch64__)
@@ -573,6 +580,7 @@ extern "C" {
 #define HAS_ARGBTOUV444ROW_NEON_I8MM
 #define HAS_ARGBTOUVJ444ROW_NEON_I8MM
 #define HAS_ARGBTOUVJROW_NEON_I8MM
+#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
 #define HAS_ARGBTOUVROW_NEON_I8MM
 #define HAS_BGRATOUVROW_NEON_I8MM
 #define HAS_RGBATOUVROW_NEON_I8MM
@@ -588,6 +596,7 @@ extern "C" {
 #define HAS_ARGBTORGB565DITHERROW_SVE2
 #define HAS_ARGBTORGB565ROW_SVE2
 #define HAS_ARGBTOUVJROW_SVE2
+#define HAS_ARGBTOUVMATRIXROW_SVE2
 #define HAS_ARGBTOUVROW_SVE2
 #define HAS_AYUVTOUVROW_SVE2
 #define HAS_AYUVTOVUROW_SVE2
@@ -639,6 +648,7 @@ extern "C" {
 #define HAS_ABGRTOUVROW_SME
 #define HAS_ARGBMULTIPLYROW_SME
 #define HAS_ARGBTOUVJROW_SME
+#define HAS_ARGBTOUVMATRIXROW_SME
 #define HAS_ARGBTOUVROW_SME
 #define HAS_BGRATOUVROW_SME
 #define HAS_CONVERT16TO8ROW_SME
@@ -1834,6 +1844,43 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
                                uint8_t* dst_u,
                                uint8_t* dst_v,
                                int width);
+void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb,
+                                int src_stride_argb,
+                                uint8_t* dst_u,
+                                uint8_t* dst_v,
+                                int width,
+                                const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                 int src_stride_argb,
+                                 uint8_t* dst_u,
+                                 uint8_t* dst_v,
+                                 int width,
+                                 const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb,
+                                     int src_stride_argb,
+                                     uint8_t* dst_u,
+                                     uint8_t* dst_v,
+                                     int width,
+                                     const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_SME(const uint8_t* src_argb,
+                           int src_stride_argb,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width,
+                           const struct ArgbConstants* c);
+
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 0b0290a97..b745710eb 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1933
+#define LIBYUV_VERSION 1934
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
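All of the new ARGBToUVMatrixRow prototypes take a const struct
ArgbConstants*. The struct definition is not part of this diff; the AArch64
kernel below loads kRGBToU from byte offset 16 and kRGBToV from offset 32,
and the AVX2 kernel reads each field as a full 16-byte vector, so its layout
is presumably along these lines (an inferred sketch, not the authoritative
definition):

  // Inferred from the kernel loads. The NEON comments below label the
  // coefficients (-BU), (-GU), ... so they are stored negated, and they are
  // replicated so one 16-byte load covers 4 pixels.
  struct ArgbConstants {
    int8_t kRGBToY[16];  // assumed at offset 0
    int8_t kRGBToU[16];  // offset 16: {BU, GU, RU, AU} x 4
    int8_t kRGBToV[16];  // offset 32: {BV, GV, RV, AV} x 4
  };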
diff --git a/source/convert.cc b/source/convert.cc
index a1a7ba9bf..d9fb54778 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -2245,6 +2245,32 @@
       ARGBToUVMatrixRow_C;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 7e7a4f8cf..2c66611e6 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -574,6 +574,32 @@
       ARGBToUVMatrixRow_C;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@@ -915,6 +941,32 @@
       ARGBToUVMatrixRow_C;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
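These selection ladders deliberately let a later block overwrite an earlier
pick, so the strongest kernel the CPU reports wins; the SSSE3/AVX2 blocks
that follow are compiled out on Arm builds. Note that the SVE2 and SME
kernels are installed without an _Any_ wrapper or width check, consistent
with predicated loops handling the tail in-kernel. The net effect on a core
with all four Arm flags set:

  // Illustrative trace of the ladder on a NEON+I8MM+SVE2+SME core:
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;       // overwritten below
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;  // overwritten below
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;           // overwritten below
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;            // final choice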
@@ -4065,13 +4117,6 @@ int ARGBToAB64(const uint8_t* src_argb,
   return 0;
 }
 
-// Enabled if 1 pass is available
-#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
-#define HAS_RAWTOYJROW
-#endif
-
-// RAW to JNV21 full range NV21
-LIBYUV_API
 // Convert RAW to NV21 with Matrix.
 LIBYUV_API
 int RAWToNV21Matrix(const uint8_t* src_raw,
@@ -4226,6 +4271,32 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 9fa7227b7..82a4abe8d 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2264,6 +2264,12 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
   memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \
   }
 
+#ifdef HAS_ARGBTOUVMATRIXROW_NEON
+ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
+ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX2
 ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
 #endif
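The ANY12MS macro wraps a fixed-width two-plane kernel for arbitrary widths:
the multiple-of-16 prefix goes straight to the SIMD kernel, and the remainder
is copied into a zero-padded temporary, processed at full vector width, then
copied back out. Roughly (a simplified shape; names and buffer sizes only
approximate the real macro in row_any.cc):

  void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src, int stride,
                                  uint8_t* dst_u, uint8_t* dst_v, int width,
                                  const struct ArgbConstants* c) {
    SIMD_ALIGNED(uint8_t vin[128 * 2]);   // both input rows
    SIMD_ALIGNED(uint8_t vout[128 * 3]);  // U at +0, V at +256
    int r = width & 15;   // remainder pixels
    int n = width & ~15;  // multiple-of-16 prefix
    if (n > 0) {
      ARGBToUVMatrixRow_NEON(src, stride, dst_u, dst_v, n, c);
    }
    memset(vin, 0, sizeof(vin));  // zero-pad so the kernel reads defined data
    memcpy(vin, src + n * 4, r * 4);
    memcpy(vin + 128, src + stride + n * 4, r * 4);
    ARGBToUVMatrixRow_NEON(vin, 128, vout, vout + 256, 16, c);
    memcpy(dst_u + (n >> 1), vout, (r + 1) >> 1);
    memcpy(dst_v + (n >> 1), vout + 256, (r + 1) >> 1);
  }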
diff --git a/source/row_neon.cc b/source/row_neon.cc
index d1073352f..895e6f113 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1918,6 +1918,72 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
 // clang-format on
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
+  asm volatile (
+      "add        %1, %0, %1                 \n"  // src_stride + src_argb
+      "vld1.8     {d18}, [%5]                \n"  // load kRGBToU
+      "vld1.8     {d19}, [%6]                \n"  // load kRGBToV
+      "vmovl.s8   q8, d18                    \n"  // U coeffs in q8 (d16, d17)
+      "vmovl.s8   q9, d19                    \n"  // V coeffs in q9 (d18, d19)
+      "vdup.16    q10, d16[0]                \n"  // U0
+      "vdup.16    q11, d16[1]                \n"  // U1
+      "vdup.16    q12, d16[2]                \n"  // U2
+      "vdup.16    q13, d18[0]                \n"  // V0
+      "vdup.16    q14, d18[1]                \n"  // V1
+      "vdup.16    q15, d18[2]                \n"  // V2
+      "1:                                    \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!    \n"  // load 8 ARGB pixels.
+      "vld4.8     {d1, d3, d5, d7}, [%0]!    \n"  // load next 8 ARGB pixels.
+      "subs       %4, %4, #16                \n"  // 16 processed per loop.
+      "vpaddl.u8  q0, q0                     \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8  q1, q1                     \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8  q2, q2                     \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8     {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ARGB pixels.
+      "vld4.8     {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ARGB pixels.
+      "vpadal.u8  q0, q4                     \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8  q1, q5                     \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8  q2, q6                     \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16  q0, q0, #2                 \n"  // average of 4
+      "vrshr.u16  q1, q1, #2                 \n"
+      "vrshr.u16  q2, q2, #2                 \n"
+
+      "vmov.u16   q3, #0x8000                \n"  // 128.0
+
+      "vmul.s16   q8, q0, q10                \n"  // U = B * U0
+      "vmla.s16   q8, q1, q11                \n"  // U += G * U1
+      "vmla.s16   q8, q2, q12                \n"  // U += R * U2
+
+      "vmul.s16   q9, q0, q13                \n"  // V = B * V0
+      "vmla.s16   q9, q1, q14                \n"  // V += G * V1
+      "vmla.s16   q9, q2, q15                \n"  // V += R * V2
+
+      "vsub.u16   q8, q3, q8                 \n"  // 128.0 - U
+      "vsub.u16   q9, q3, q9                 \n"  // 128.0 - V
+
+      "vqshrn.u16 d0, q8, #8                 \n"  // saturating narrowing shift right
+      "vqshrn.u16 d1, q9, #8                 \n"
+
+      "vst1.8     {d0}, [%2]!                \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                \n"  // store 8 pixels V.
+      "bgt        1b                         \n"
+      : "+r"(src_argb),         // %0
+        "+r"(src_stride_argb),  // %1
+        "+r"(dst_u),            // %2
+        "+r"(dst_v),            // %3
+        "+r"(width)             // %4
+      : "r"(&c->kRGBToU),  // %5
+        "r"(&c->kRGBToV)   // %6
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
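For intuition, both NEON kernels implement this scalar model. The
coefficients arrive negated, so the kernel can form 0x8000 - sum (128.0 in
8.8 fixed point) and keep only the high byte:

  // Scalar model of the U channel, assuming the classic BT.601 studio-range
  // coefficients (U: +112 B, -74 G, -38 R); b, g, r are the rounded 2x2
  // averages the kernel computes with vpaddl/vpadal/vrshr.
  static uint8_t ToU(int b, int g, int r) {
    int sum = b * -112 + g * 74 + r * 38;   // negated coefficients, as stored
    return (uint8_t)((0x8000 - sum) >> 8);  // 128 + (112*b - 74*g - 38*r)/256
  }

A gray block (b = g = r = 128) gives sum = 0, so U = 0x8000 >> 8 = 128, the
neutral chroma value; a pure red block (b = g = 0, r = 255) gives
(32768 - 9690) >> 8 = 90. The result always fits: |sum| <= 112 * 255 = 28560,
so 0x8000 - sum stays within 16 bits and the narrowing shift cannot overflow.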
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 09bad8df9..19016cc3b 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -2893,14 +2894,26 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
 // TODO(fbarchard): consider ptrdiff_t for all strides.
 
-void ARGBToUVRow_NEON(const uint8_t* src_argb,
-                      int src_stride_argb,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
+void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
-      RGBTOUV_SETUP_REG
+      "ldr         q16, [%[c], #16]             \n"  // kRGBToU
+      "ldr         q17, [%[c], #32]             \n"  // kRGBToV
+      "sxtl        v16.8h, v16.8b               \n"  // sign extend U coeffs to 16-bit
+      "sxtl        v17.8h, v17.8b               \n"  // sign extend V coeffs to 16-bit
+      "dup         v20.8h, v16.h[0]             \n"  // U0 (-BU)
+      "dup         v21.8h, v16.h[1]             \n"  // U1 (-GU)
+      "dup         v22.8h, v16.h[2]             \n"  // U2 (-RU)
+      "dup         v23.8h, v17.h[0]             \n"  // V0 (-BV)
+      "dup         v24.8h, v17.h[1]             \n"  // V1 (-GV)
+      "dup         v26.8h, v17.h[2]             \n"  // V2 (-RV)
+      "movi        v25.8h, #0x80, lsl #8        \n"  // 128.0 in 16-bit (0x8000)
+      "1:                                       \n"
       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
       "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
@@ -2909,7 +2922,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
       "uaddlp      v1.8h, v1.16b                \n"  // G 16 bytes -> 8 shorts.
       "uaddlp      v2.8h, v2.16b                \n"  // R 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
       "uadalp      v0.8h, v4.16b                \n"  // B 16 bytes -> 8 shorts.
       "prfm        pldl1keep, [%1, 448]         \n"
       "uadalp      v1.8h, v5.16b                \n"  // G 16 bytes -> 8 shorts.
@@ -2919,7 +2932,20 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
       "urshr       v1.8h, v1.8h, #2             \n"
       "urshr       v2.8h, v2.8h, #2             \n"
 
-      RGBTOUV(v0.8h, v1.8h, v2.8h)
+      // U = B*U0 + G*U1 + R*U2
+      "mul         v3.8h, v0.8h, v20.8h         \n"
+      "mla         v3.8h, v1.8h, v21.8h         \n"
+      "mla         v3.8h, v2.8h, v22.8h         \n"
+
+      // V = B*V0 + G*V1 + R*V2
+      "mul         v4.8h, v0.8h, v23.8h         \n"
+      "mla         v4.8h, v1.8h, v24.8h         \n"
+      "mla         v4.8h, v2.8h, v26.8h         \n"
+
+      // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
+      "subhn       v0.8b, v25.8h, v3.8h         \n"
+      "subhn       v1.8b, v25.8h, v4.8h         \n"
+
       "st1         {v0.8b}, [%2], #8            \n"  // store 8 pixels U.
       "st1         {v1.8b}, [%3], #8            \n"  // store 8 pixels V.
       "b.gt        1b                           \n"
@@ -2928,12 +2954,21 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
         "+r"(dst_u),  // %2
         "+r"(dst_v),  // %3
         "+r"(width)   // %4
-      :
+      : [c] "r"(c)  // %5
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v20", "v21", "v22", "v23", "v24", "v25"
+        "v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
   );
 }
 
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
+                         &kArgbI601Constants);
+}
+
 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
@@ -3449,7 +3484,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 
 // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
-static void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src,
+static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
                                         int src_stride,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
@@ -3546,12 +3581,25 @@ static const int8_t kRGBAToUVCoefficients[] = {
     0, -112, 74, 38, 0, 18, 94, -112,
 };
 
+void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                 int src_stride_argb,
+                                 uint8_t* dst_u,
+                                 uint8_t* dst_v,
+                                 int width,
+                                 const struct ArgbConstants* c) {
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2],
+      (int8_t)c->kRGBToU[3], (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+      (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
+                                   width, uvconstants);
+}
+
 void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
                               kARGBToUVCoefficients);
 }
 
@@ -3560,7 +3608,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                               kABGRToUVCoefficients);
 }
 
@@ -3569,7 +3617,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
                               kBGRAToUVCoefficients);
 }
 
@@ -3578,7 +3626,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
                               kRGBAToUVCoefficients);
 }
 
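One AArch64-specific nicety above: subhn performs the subtraction and returns
the high half of each 16-bit lane in a single instruction, so the 128 bias,
the >> 8 and the narrowing to bytes collapse into one op per vector, where
the AArch32 version needs a vsub/vqshrn pair. Per lane it is equivalent to:

  // What each lane of "subhn v0.8b, v25.8h, v3.8h" computes:
  static uint8_t SubHighNarrow(uint16_t bias /* 0x8000 */, uint16_t sum) {
    return (uint8_t)((uint16_t)(bias - sum) >> 8);  // high byte of difference
  }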
@@ -3606,7 +3654,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
                               kARGBToUVJCoefficients);
 }
 
@@ -3615,7 +3663,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                               kABGRToUVJCoefficients);
 }
 
diff --git a/source/row_sme.cc b/source/row_sme.cc
index bd61b20bf..fca536dc4 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -1120,6 +1120,20 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
       : "cc", "memory", "z0", "z1", "z2", "p0", "p1");
 }
 
+__arm_locally_streaming void ARGBToUVMatrixRow_SME(
+    const uint8_t* src_argb,
+    int src_stride_argb,
+    uint8_t* dst_u,
+    uint8_t* dst_v,
+    int width,
+    const struct ArgbConstants* c) {
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2],
+      (int8_t)c->kRGBToU[3], (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+      (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
+                           uvconstants);
+}
+
 __arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
                                              int src_stride_argb,
                                              uint8_t* dst_u,
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 4a51b68fc..7d8734921 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -217,6 +217,19 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
   NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
 }
 
+void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2],
+      (int8_t)c->kRGBToU[3], (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+      (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
+                           uvconstants);
+}
+
 void ARGBToUVRow_SVE2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
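The SVE2 and SME entry points share one streaming-compatible body,
ARGBToUVMatrixRow_SVE_SC (the function at the top of the Pixel 9 profile);
each wrapper only repacks struct ArgbConstants into the 8-byte
{U0..U3, V0..V3} layout that body expects. A hypothetical shared helper, not
part of this patch, would remove the repeated packing:

  // Hypothetical refactor: the NEON_I8MM, SVE2 and SME wrappers all repeat
  // this repacking of the first four U and V coefficients.
  static inline void PackUVConstants(const struct ArgbConstants* c,
                                     int8_t uvconstants[8]) {
    for (int i = 0; i < 4; ++i) {
      uvconstants[i] = (int8_t)c->kRGBToU[i];
      uvconstants[4 + i] = (int8_t)c->kRGBToV[i];
    }
  }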
diff --git a/source/row_win.cc b/source/row_win.cc
index 25f3ac9fe..77070d031 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -314,6 +314,95 @@ void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int wi
 }
 #endif
 
+#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
+LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
+void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
+  // 16-byte coefficient rows, broadcast to both 128-bit lanes.
+  __m256i ymm_u = _mm256_broadcastsi128_si256(
+      _mm_loadu_si128((const __m128i*)c->kRGBToU));
+  __m256i ymm_v = _mm256_broadcastsi128_si256(
+      _mm_loadu_si128((const __m128i*)c->kRGBToV));
+  __m256i ymm_0101 = _mm256_set1_epi16(0x0101);  // byte weights of 1
+  __m256i ymm_shuf = _mm256_setr_epi8(  // pair each channel of adjacent pixels
+      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
+  __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
+  __m256i ymm_zero = _mm256_setzero_si256();
+
+  while (width > 0) {
+    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
+    __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
+    __m256i ymm2 =
+        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
+    __m256i ymm3 =
+        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
+
+    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
+    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
+    ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
+    ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf);
+
+    ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101);  // sum adjacent pixel pairs
+    ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101);
+    ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101);
+    ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101);
+
+    ymm0 = _mm256_add_epi16(ymm0, ymm2);  // add the second row
+    ymm1 = _mm256_add_epi16(ymm1, ymm3);
+
+    ymm0 = _mm256_srli_epi16(ymm0, 1);  // halve, then round via avg:
+    ymm1 = _mm256_srli_epi16(ymm1, 1);
+    ymm0 = _mm256_avg_epu16(ymm0, ymm_zero);  // net effect: (sum + 2) >> 2
+    ymm1 = _mm256_avg_epu16(ymm1, ymm_zero);
+
+    ymm0 = _mm256_packus_epi16(ymm0, ymm1);       // averages back to bytes
+    ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);  // fix lane interleave
+
+    ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v);  // B*V0+G*V1, R*V2+A*V3
+    ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u);  // B*U0+G*U1, R*U2+A*U3
+
+    ymm0 = _mm256_hadd_epi16(ymm0, ymm1);  // finish dot products: U | V
+    ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
+    ymm0 = _mm256_sub_epi16(ymm_8000, ymm0);  // bias: 0x8000 - sum
+    ymm0 = _mm256_srli_epi16(ymm0, 8);        // keep the high byte
+    ymm0 = _mm256_packus_epi16(ymm0, ymm0);
+
+    __m128i xmm_u = _mm256_castsi256_si128(ymm0);
+    __m128i xmm_v = _mm256_extracti128_si256(ymm0, 1);
+
+    _mm_storel_epi64((__m128i*)dst_u, xmm_u);  // store 8 pixels U.
+    _mm_storel_epi64((__m128i*)dst_v, xmm_v);  // store 8 pixels V.
+
+    src_argb += 64;
+    dst_u += 8;
+    dst_v += 8;
+    width -= 16;
+  }
+}
+#endif
+
+#ifdef HAS_MERGEUVROW_AVX2
+LIBYUV_TARGET_AVX2
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width) {
+  while (width > 0) {
+    // Widen 16 U and 16 V bytes to 16-bit lanes, then OR V into the high
+    // byte of each lane to interleave: u0 v0 u1 v1 ...
+    __m256i ymm0 =
+        _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
+    __m256i ymm1 =
+        _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
+
+    ymm1 = _mm256_slli_epi16(ymm1, 8);
+    ymm0 = _mm256_or_si256(ymm0, ymm1);
+
+    _mm256_storeu_si256((__m256i*)dst_uv, ymm0);
+
+    src_u += 16;
+    src_v += 16;
+    dst_uv += 32;
+    width -= 16;
+  }
+}
+#endif
+
 #endif
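For completeness, the path exercised by the RAWToNV21_Opt benchmark is
reachable through the public one-shot conversion. A minimal caller, assuming
the usual libyuv convert.h signature and tightly packed planes (even width
and height for simplicity):

  #include "libyuv/convert.h"

  // dst_y: width * height bytes; dst_vu: width * height / 2 bytes (VU pairs).
  int ConvertFrame(const uint8_t* raw, int width, int height,
                   uint8_t* dst_y, uint8_t* dst_vu) {
    return libyuv::RAWToNV21(raw, width * 3,  // RAW is 3 bytes per pixel
                             dst_y, width,    // Y plane, stride == width
                             dst_vu, width,   // interleaved VU plane
                             width, height);  // 0 on success
  }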