diff --git a/source/row_sve.cc b/source/row_sve.cc
index a4acb69a4..7251fe79d 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -217,9 +217,7 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
   NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
 }
 
-// Dot-product constants are stored as four-tuples with the two innermost
-// elements flipped to account for the interleaving nature of the widening
-// addition instructions.
+// SVE constants are stored negated so that 128 (stored as -128) fits in int8_t.
 
 // RGB to BT601 coefficients
 // UB   0.875 coefficient = 112
@@ -229,25 +227,24 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
 // VG -0.7344 coefficient = -94
 // VR   0.875 coefficient = 112
 
-// SVE constants are not negated
-static const int16_t kARGBToUVCoefficients[] = {
-    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
-    112, -38, -74, 0, -18, 112, -94, 0,
+static const int8_t kARGBToUVCoefficients[] = {
+    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
+    -112, 74, 38, 0, 18, 94, -112, 0,
 };
 
-static const int16_t kRGBAToUVCoefficients[] = {
-    // 0, -UG, UB, -UR, 0, -VG, -VB, VR
-    0, -74, 112, -38, 0, -94, -18, 112,
+static const int8_t kABGRToUVCoefficients[] = {
+    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
+    38, 74, -112, 0, -112, 94, 18, 0,
 };
 
-static const int16_t kBGRAToUVCoefficients[] = {
-    // 0, -UG, -UR, UB, 0, -VG, VR, -VB
-    0, -74, -38, 112, 0, -94, 112, -18,
+static const int8_t kBGRAToUVCoefficients[] = {
+    // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
+    0, 38, 74, -112, 0, -112, 94, 18,
 };
 
-static const int16_t kABGRToUVCoefficients[] = {
-    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
-    -38, 112, -74, 0, 112, -18, -94, 0,
+static const int8_t kRGBAToUVCoefficients[] = {
+    // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
+    0, -112, 74, 38, 0, 18, 94, -112,
 };
 
 // RGB to JPEG coefficients
@@ -258,169 +255,138 @@ static const int16_t kABGRToUVCoefficients[] = {
 // VG -0.41869  coefficient = -107
 // VR 0.500     coefficient = 128
 
-static const int16_t kARGBToUVJCoefficients[] = {
-    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
-    128, -43, -85, 0, -21, 128, -107, 0,
+static const int8_t kARGBToUVJCoefficients[] = {
+    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
+    -128, 85, 43, 0, 21, 107, -128, 0,
 };
 
-static const int16_t kABGRToUVJCoefficients[] = {
-    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
-    -43, 128, -85, 0, 128, -21, -107, 0,
+static const int8_t kABGRToUVJCoefficients[] = {
+    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
+    43, 85, -128, 0, -128, 107, 21, 0,
 };
 
+#define ABCDTOUVMATRIX_SVE /* Average 2x2 pixel blocks, then compute U/V */ \
+  "ld1d     {z0.d}, p1/z, [%[src0]]               \n" /* ABCD(bgra) */      \
+  "ld1d     {z1.d}, p2/z, [%[src0], #1, mul vl]   \n" /* EFGH(bgra) */      \
+  "ld1d     {z2.d}, p3/z, [%[src0], #2, mul vl]   \n" /* IJKL(bgra) */      \
+  "ld1d     {z3.d}, p4/z, [%[src0], #3, mul vl]   \n" /* MNOP(bgra) */      \
+  "ld1d     {z4.d}, p1/z, [%[src1]]               \n" /* ABCD(bgra) */      \
+  "ld1d     {z5.d}, p2/z, [%[src1], #1, mul vl]   \n" /* EFGH(bgra) */      \
+  "ld1d     {z6.d}, p3/z, [%[src1], #2, mul vl]   \n" /* IJKL(bgra) */      \
+  "ld1d     {z7.d}, p4/z, [%[src1], #3, mul vl]   \n" /* MNOP(bgra) */      \
+  "incb     %[src0], all, mul #4                  \n"                       \
+  "incb     %[src1], all, mul #4                  \n"                       \
+                                                                            \
+  "uaddlb   z16.h, z0.b, z4.b                     \n" /* ABCD(br) */        \
+  "uaddlb   z18.h, z1.b, z5.b                     \n" /* EFGH(br) */        \
+  "uaddlb   z20.h, z2.b, z6.b                     \n" /* IJKL(br) */        \
+  "uaddlb   z22.h, z3.b, z7.b                     \n" /* MNOP(br) */        \
+  "uaddlt   z17.h, z0.b, z4.b                     \n" /* ABCD(ga) */        \
+  "uaddlt   z19.h, z1.b, z5.b                     \n" /* EFGH(ga) */        \
+  "uaddlt   z21.h, z2.b, z6.b                     \n" /* IJKL(ga) */        \
+  "uaddlt   z23.h, z3.b, z7.b                     \n" /* MNOP(ga) */        \
+                                                                            \
+  /* ADDP on 32-bit elements adds adjacent pairs of 9-bit unsigned sums. */ \
+  "addp     z16.s, p0/m, z16.s, z18.s             \n" /* ABEFCDGH(br) */    \
+  "addp     z17.s, p0/m, z17.s, z19.s             \n" /* ABEFCDGH(ga) */    \
+  "addp     z20.s, p0/m, z20.s, z22.s             \n" /* IJMNKLOP(br) */    \
+  "addp     z21.s, p0/m, z21.s, z23.s             \n" /* IJMNKLOP(ga) */    \
+                                                                            \
+  "rshrnb    z0.b, z16.h, #2                      \n" /* ABEFCDGH(b0r0) */  \
+  "rshrnb    z1.b, z20.h, #2                      \n" /* IJMNKLOP(b0r0) */  \
+  "rshrnt    z0.b, z17.h, #2                      \n" /* ABEFCDGH(bgra) */  \
+  "rshrnt    z1.b, z21.h, #2                      \n" /* IJMNKLOP(bgra) */  \
+                                                                            \
+  "tbl       z0.s, {z0.s}, z27.s                  \n" /* ABCDEFGH */        \
+  "tbl       z1.s, {z1.s}, z27.s                  \n" /* IJKLMNOP */        \
+                                                                            \
+  "subs     %w[width], %w[width], %w[vl], lsl #2  \n" /* 4*vl per loop */   \
+                                                                            \
+  "movi     v16.8h, #0                            \n"                       \
+  "movi     v17.8h, #0                            \n"                       \
+  "movi     v20.8h, #0                            \n"                       \
+  "movi     v21.8h, #0                            \n"                       \
+                                                                            \
+  "usdot     z16.s, z0.b, z24.b                   \n" /* -U */              \
+  "usdot     z17.s, z1.b, z24.b                   \n" /* -U */              \
+  "usdot     z20.s, z0.b, z25.b                   \n" /* -V */              \
+  "usdot     z21.s, z1.b, z25.b                   \n" /* -V */              \
+                                                                            \
+  "subhnb   z16.b, z26.h, z16.h                   \n" /* U */               \
+  "subhnb   z20.b, z26.h, z20.h                   \n" /* V */               \
+  "subhnb   z17.b, z26.h, z17.h                   \n" /* U */               \
+  "subhnb   z21.b, z26.h, z21.h                   \n" /* V */               \
+                                                                            \
+  "uzp1     z16.h, z16.h, z17.h                   \n" /* pack U */          \
+  "uzp1     z20.h, z20.h, z21.h                   \n" /* pack V */          \
+                                                                            \
+  "st1b     {z16.h}, p5, [%[dst_u]]               \n" /* U */               \
+  "st1b     {z20.h}, p5, [%[dst_v]]               \n" /* V */               \
+  "inch     %[dst_u]                              \n"                       \
+  "inch     %[dst_v]                              \n"
+
 static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                                    int src_stride_argb,
                                    uint8_t* dst_u,
                                    uint8_t* dst_v,
                                    int width,
-                                   const int16_t* uvconstants) {
+                                   const int8_t* uvconstants) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   uint64_t vl;
+  asm("cntd %x0" : "=r"(vl));
+
+  // Width is a multiple of two here, so halve it.
+  width >>= 1;
+
   asm volatile(
-      "ptrue    p0.b                                \n"
-      "ld1rd    {z24.d}, p0/z, [%[uvconstants]]     \n"
-      "ld1rd    {z25.d}, p0/z, [%[uvconstants], #8] \n"
-      "mov      z26.h, #0x8000                      \n"  // 128.0 (0x8000)
-      "cntb     %[vl]                               \n"
-      "subs     %w[width], %w[width], %w[vl]        \n"
-      "b.lt     2f                                  \n"
+      "ptrue    p0.b                                 \n"
+      "ld1rw    {z24.s}, p0/z, [%[uvconstants]]      \n"
+      "ld1rw    {z25.s}, p0/z, [%[uvconstants], #4]  \n"
+      "mov      z26.h, #0x8000                       \n"  // 128.0 (0x8000)
 
-      // Process 4x vectors from each input row per iteration.
-      // Cannot use predication here due to unrolling.
-      "1:                                           \n"  // e.g.
-      "ld1b     {z0.b}, p0/z, [%[src0], #0, mul vl] \n"  // bgrabgra
-      "ld1b     {z4.b}, p0/z, [%[src1], #0, mul vl] \n"  // bgrabgra
-      "ld1b     {z1.b}, p0/z, [%[src0], #1, mul vl] \n"  // bgrabgra
-      "ld1b     {z5.b}, p0/z, [%[src1], #1, mul vl] \n"  // bgrabgra
-      "ld1b     {z2.b}, p0/z, [%[src0], #2, mul vl] \n"  // bgrabgra
-      "ld1b     {z6.b}, p0/z, [%[src1], #2, mul vl] \n"  // bgrabgra
-      "ld1b     {z3.b}, p0/z, [%[src0], #3, mul vl] \n"  // bgrabgra
-      "ld1b     {z7.b}, p0/z, [%[src1], #3, mul vl] \n"  // bgrabgra
-      "incb     %[src0], all, mul #4                \n"
-      "incb     %[src1], all, mul #4                \n"
+      // Generate some TBL indices to undo the interleaving from ADDP.
+      "index    z0.s, #0, #1                         \n"
+      "index    z1.s, #1, #1                         \n"
+      "uzp1     z27.s, z0.s, z1.s                    \n"
 
-      "uaddlb   z16.h, z0.b, z4.b                   \n"  // brbrbrbr
-      "uaddlt   z17.h, z0.b, z4.b                   \n"  // gagagaga
-      "uaddlb   z18.h, z1.b, z5.b                   \n"  // brbrbrbr
-      "uaddlt   z19.h, z1.b, z5.b                   \n"  // gagagaga
-      "uaddlb   z20.h, z2.b, z6.b                   \n"  // brbrbrbr
-      "uaddlt   z21.h, z2.b, z6.b                   \n"  // gagagaga
-      "uaddlb   z22.h, z3.b, z7.b                   \n"  // brbrbrbr
-      "uaddlt   z23.h, z3.b, z7.b                   \n"  // gagagaga
+      "subs     %w[width], %w[width], %w[vl], lsl #2 \n"
+      "b.lt    2f                                    \n"
 
-      "trn1     z0.s, z16.s, z17.s                  \n"  // brgabgra
-      "trn2     z1.s, z16.s, z17.s                  \n"  // brgabgra
-      "trn1     z2.s, z18.s, z19.s                  \n"  // brgabgra
-      "trn2     z3.s, z18.s, z19.s                  \n"  // brgabgra
-      "trn1     z4.s, z20.s, z21.s                  \n"  // brgabgra
-      "trn2     z5.s, z20.s, z21.s                  \n"  // brgabgra
-      "trn1     z6.s, z22.s, z23.s                  \n"  // brgabgra
-      "trn2     z7.s, z22.s, z23.s                  \n"  // brgabgra
+      "ptrue  p1.d                                   \n"
+      "ptrue  p2.d                                   \n"
+      "ptrue  p3.d                                   \n"
+      "ptrue  p4.d                                   \n"
+      "ptrue  p5.h                                   \n"
+      "1:                                            \n"  //
+      ABCDTOUVMATRIX_SVE
+      "b.gt     1b                                   \n"
 
-      "subs     %w[width], %w[width], %w[vl]        \n"  // 4*VL per loop
+      "2:                                            \n"
+      "adds    %w[width], %w[width], %w[vl], lsl #2  \n"
+      "b.eq    99f                                   \n"
 
-      "add      z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
-      "add      z2.h, p0/m, z2.h, z3.h              \n"  // brgabrga
-      "add      z4.h, p0/m, z4.h, z5.h              \n"  // brgabrga
-      "add      z6.h, p0/m, z6.h, z7.h              \n"  // brgabrga
+      "3:                                            \n"
+      "whilelt  p1.d, wzr, %w[width]                 \n"
+      "whilelt  p2.d, %w[vl], %w[width]              \n"
+      "whilelt  p3.d, %w[vl2], %w[width]             \n"
+      "whilelt  p4.d, %w[vl3], %w[width]             \n"
+      "whilelt  p5.h, wzr, %w[width]                 \n"  //
+      ABCDTOUVMATRIX_SVE
+      "b.gt     3b                                   \n"
 
-      "urshr    z0.h, p0/m, z0.h, #2                \n"  // brgabrga
-      "urshr    z2.h, p0/m, z2.h, #2                \n"  // brgabrga
-      "urshr    z4.h, p0/m, z4.h, #2                \n"  // brgabrga
-      "urshr    z6.h, p0/m, z6.h, #2                \n"  // brgabrga
-
-      "movi     v16.8h, #0                          \n"
-      "movi     v17.8h, #0                          \n"
-      "movi     v18.8h, #0                          \n"
-      "movi     v19.8h, #0                          \n"
-
-      "movi     v20.8h, #0                          \n"
-      "movi     v21.8h, #0                          \n"
-      "movi     v22.8h, #0                          \n"
-      "movi     v23.8h, #0                          \n"
-
-      "sdot     z16.d, z0.h, z24.h                  \n"  // UUxxxxxx
-      "sdot     z17.d, z2.h, z24.h                  \n"  // UUxxxxxx
-      "sdot     z18.d, z4.h, z24.h                  \n"  // UUxxxxxx
-      "sdot     z19.d, z6.h, z24.h                  \n"  // UUxxxxxx
-
-      "sdot     z20.d, z0.h, z25.h                  \n"  // VVxxxxxx
-      "sdot     z21.d, z2.h, z25.h                  \n"  // VVxxxxxx
-      "sdot     z22.d, z4.h, z25.h                  \n"  // VVxxxxxx
-      "sdot     z23.d, z6.h, z25.h                  \n"  // VVxxxxxx
-
-      "uzp1     z16.s, z16.s, z17.s                 \n"  // UUxx
-      "uzp1     z18.s, z18.s, z19.s                 \n"  // UUxx
-      "uzp1     z20.s, z20.s, z21.s                 \n"  // VVxx
-      "uzp1     z22.s, z22.s, z23.s                 \n"  // VVxx
-
-      "uzp1     z16.h, z16.h, z18.h                 \n"  // UU
-      "uzp1     z20.h, z20.h, z22.h                 \n"  // VV
-
-      "addhnb   z16.b, z16.h, z26.h                 \n"  // U
-      "addhnb   z20.b, z20.h, z26.h                 \n"  // V
-
-      "st1b     {z16.h}, p0, [%[dst_u]]             \n"  // U
-      "st1b     {z20.h}, p0, [%[dst_v]]             \n"  // V
-      "inch     %[dst_u]                            \n"
-      "inch     %[dst_v]                            \n"
-
-      "b.ge     1b                                  \n"
-
-      "2:                                           \n"
-      "adds     %w[width], %w[width], %w[vl]        \n"  // VL per loop
-      "b.le     99f                                 \n"
-
-      // Process remaining pixels from each input row.
-      // Use predication to do one vector from each input array, so may loop up
-      // to three iterations.
-      "cntw     %x[vl]                              \n"
-
-      "3:                                           \n"
-      "whilelt  p1.s, wzr, %w[width]                \n"
-      "ld1d     {z0.d}, p1/z, [%[src0]]             \n"  // bgrabgra
-      "ld1d     {z4.d}, p1/z, [%[src1]]             \n"  // bgrabgra
-      "incb     %[src0]                             \n"
-      "incb     %[src1]                             \n"
-
-      "uaddlb   z16.h, z0.b, z4.b                   \n"  // brbrbrbr
-      "uaddlt   z17.h, z0.b, z4.b                   \n"  // gagagaga
-
-      "trn1     z0.s, z16.s, z17.s                  \n"  // brgabgra
-      "trn2     z1.s, z16.s, z17.s                  \n"  // brgabgra
-
-      "add      z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
-
-      "urshr    z0.h, p0/m, z0.h, #2                \n"  // brgabrga
-
-      "subs     %w[width], %w[width], %w[vl]        \n"  // VL per loop
-
-      "movi     v16.8h, #0                          \n"
-      "movi     v20.8h, #0                          \n"
-
-      "sdot     z16.d, z0.h, z24.h                  \n"
-      "sdot     z20.d, z0.h, z25.h                  \n"
-
-      "addhnb   z16.b, z16.h, z26.h                 \n"  // U
-      "addhnb   z20.b, z20.h, z26.h                 \n"  // V
-
-      "st1b     {z16.d}, p0, [%[dst_u]]             \n"  // U
-      "st1b     {z20.d}, p0, [%[dst_v]]             \n"  // V
-      "incd     %[dst_u]                            \n"
-      "incd     %[dst_v]                            \n"
-      "b.gt     3b                                  \n"
-
-      "99:                                          \n"
-      : [src0] "+r"(src_argb),    // %[src0]
-        [src1] "+r"(src_argb_1),  // %[src1]
-        [dst_u] "+r"(dst_u),      // %[dst_u]
-        [dst_v] "+r"(dst_v),      // %[dst_v]
-        [width] "+r"(width),      // %[width]
-        [vl] "=&r"(vl)            // %[vl]
-      : [uvconstants] "r"(uvconstants)
+      "99:                                           \n"
+      : [src0] "+r"(src_argb),           // %[src0]
+        [src1] "+r"(src_argb_1),         // %[src1]
+        [dst_u] "+r"(dst_u),             // %[dst_u]
+        [dst_v] "+r"(dst_v),             // %[dst_v]
+        [width] "+r"(width)              // %[width]
+      : [uvconstants] "r"(uvconstants),  // %[uvconstants]
+        [vl] "r"(vl),                    // %[vl]
+        [vl2] "r"(vl * 2),               // %[vl2]
+        [vl3] "r"(vl * 3)                // %[vl3]
       : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
         "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
-        "p0");
+        "z27", "p0", "p1", "p2", "p3", "p4", "p5");
 }
 
 void ARGBToUVRow_SVE2(const uint8_t* src_argb,