diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 40272cf5a..1efb6ccc8 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -433,7 +433,9 @@ extern "C" {
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVJ444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
+#if !defined(__GNUC__) || defined(__clang__)
 #define HAS_ARGBTOUVMATRIXROW_NEON
+#endif
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #if !defined(__aarch64__)
diff --git a/source/convert.cc b/source/convert.cc
index d9fb54778..c64042691 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -2263,12 +2263,16 @@ ARGBToUVMatrixRow_C;
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
     if (TestCpuFlag(kCpuHasSVE2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
     if (TestCpuFlag(kCpuHasSME)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 2c66611e6..db45f4794 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -592,12 +592,16 @@ ARGBToUVMatrixRow_C;
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
     if (TestCpuFlag(kCpuHasSVE2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
     if (TestCpuFlag(kCpuHasSME)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
@@ -959,12 +963,16 @@ ARGBToUVMatrixRow_C;
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
     if (TestCpuFlag(kCpuHasSVE2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
     if (TestCpuFlag(kCpuHasSME)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
@@ -4289,12 +4297,16 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
     if (TestCpuFlag(kCpuHasSVE2)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SME)
     if (TestCpuFlag(kCpuHasSME)) {
-      ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      if (IS_ALIGNED(width, 2)) {
+        ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+      }
     }
 #endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
diff --git a/source/row_any.cc b/source/row_any.cc
index 82a4abe8d..81e0f44fb 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2201,6 +2201,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
     SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
     SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
     memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    memset(vout, 0, sizeof(vout)); /* for msan */                            \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
     if (n > 0) {                                                             \
@@ -2244,6 +2245,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
     SIMD_ALIGNED(uint8_t vin[256 * 2]);                                      \
     SIMD_ALIGNED(uint8_t vout[256 * 2]);                                     \
     memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    memset(vout, 0, sizeof(vout)); /* for msan */                            \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
     if (n > 0) {                                                             \
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 895e6f113..d893dd7e3 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1924,8 +1924,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_v,
                             int width,
                             const struct ArgbConstants* c) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
-      "add         %1, %0, %1                    \n"  // src_stride + src_argb
       "vld1.8      {d18}, [%5]                   \n"  // load kRGBToU
       "vld1.8      {d19}, [%6]                   \n"  // load kRGBToV
       "vmovl.s8    q8, d18                       \n"  // U coeffs in q8 (d16, d17)
@@ -1936,6 +1936,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
       "vdup.16     q13, d18[0]                   \n"  // V0
       "vdup.16     q14, d18[1]                   \n"  // V1
       "vdup.16     q15, d18[2]                   \n"  // V2
+
       "1:          \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
       "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels.
@@ -1963,17 +1964,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
       "vmla.s16    q9, q1, q14                   \n"  // V += G * V1
       "vmla.s16    q9, q2, q15                   \n"  // V += R * V2
 
-      "vsub.u16    q8, q3, q8                    \n"  // 128.0 - U
-      "vsub.u16    q9, q3, q9                    \n"  // 128.0 - V
-
-      "vqshrn.u16  d0, q8, #8                    \n"  // Saturating shift right
-      "vqshrn.u16  d1, q9, #8                    \n"
+      "vsubhn.s16  d0, q3, q8                    \n"  // 128.0 - U
+      "vsubhn.s16  d1, q3, q9                    \n"  // 128.0 - V
 
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
       "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
       "bgt         1b                            \n"
   : "+r"(src_argb),  // %0
-    "+r"(src_stride_argb),  // %1
+    "+r"(src_argb_1),  // %1
     "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(width)        // %4
diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc
index 177f3a669..e309b38bb 100644
--- a/unit_test/convert_argb_test.cc
+++ b/unit_test/convert_argb_test.cc
@@ -2809,6 +2809,60 @@ TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
 }
 #endif
 
+#ifdef ENABLE_ROW_TESTS
+// Compares ARGBToUVMatrixRow_Any_NEON with ARGBToUVMatrixRow_C for every width
+// in [1, kMaxWidth], once with source stride 0 (second row aliases the first)
+// and once with a distinct second row.
+TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  // Everything lives inside the guard so that builds without the NEON row
+  // function do not declare buffers that are never used.
+  if (!TestCpuFlag(kCpuHasNEON)) {
+    return;
+  }
+  const int kMaxWidth = 128;
+  SIMD_ALIGNED(uint8_t orig_argb_pixels[kMaxWidth * 4 * 2]);
+  SIMD_ALIGNED(uint8_t dest_u_c[kMaxWidth]);
+  SIMD_ALIGNED(uint8_t dest_v_c[kMaxWidth]);
+  SIMD_ALIGNED(uint8_t dest_u_opt[kMaxWidth]);
+  SIMD_ALIGNED(uint8_t dest_v_opt[kMaxWidth]);
+
+  // Deterministic byte pattern covering two rows of kMaxWidth ARGB pixels.
+  for (int i = 0; i < kMaxWidth * 4 * 2; ++i) {
+    orig_argb_pixels[i] = static_cast<uint8_t>(i * 43);
+  }
+
+  for (int width = 1; width <= kMaxWidth; ++width) {
+    for (int height = 1; height <= 2; ++height) {
+      memset(dest_u_c, 0, sizeof(dest_u_c));
+      memset(dest_v_c, 0, sizeof(dest_v_c));
+      memset(dest_u_opt, 0, sizeof(dest_u_opt));
+      memset(dest_v_opt, 0, sizeof(dest_v_opt));
+
+      // height == 1 passes stride 0; height == 2 points at the second row.
+      const int src_stride = (height == 1) ? 0 : kMaxWidth * 4;
+
+      ARGBToUVMatrixRow_C(orig_argb_pixels, src_stride, dest_u_c, dest_v_c,
+                          width, &kArgbI601Constants);
+      ARGBToUVMatrixRow_Any_NEON(orig_argb_pixels, src_stride, dest_u_opt,
+                                 dest_v_opt, width, &kArgbI601Constants);
+
+      // Only the first (width + 1) / 2 entries of each plane are compared.
+      const int half_width = (width + 1) / 2;
+      for (int i = 0; i < half_width; ++i) {
+        EXPECT_EQ(dest_u_c[i], dest_u_opt[i])
+            << "u mismatch at " << i << " width " << width << " height "
+            << height;
+        EXPECT_EQ(dest_v_c[i], dest_v_opt[i])
+            << "v mismatch at " << i << " width " << width << " height "
+            << height;
+      }
+    }
+  }
+#endif  // defined(HAS_ARGBTOUVMATRIXROW_NEON)
+}
+#endif  // ENABLE_ROW_TESTS
+
 #if !defined(DISABLE_SLOW_TESTS) && \
     (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))
 // TODO(fbarchard): Consider _set_new_mode(0) to make malloc return NULL