ABGRToJ420 call ARGBToI420Matrix

- Standardize libyuv ARGB-family (ARGB, ABGR, RGBA, BGRA) to YUV conversion by utilizing the generic MatrixRow architecture and explicit ArgbConstants.
- Consolidated ARGBToI420, ABGRToI420, BGRAToI420, and RGBAToI420 as wrappers for ARGBToI420Matrix.
- Refactored ABGRToJ420, ABGRToJ422, and ABGRToI422 to use generic matrix functions.
- Added matrix-based versions for NV21, I400, YUY2, and UYVY.
- Updated RAW and RGB24 to I420/I422/I444 dispatchers to use MatrixRow logic and explicit constants.
- Fixed parameter swap bugs in ARGBToI422, ARGBToJ422, and ABGRToJ422.
- Fixed a bug in the generic C implementation of matrix row functions ensuring all 4 channels are processed correctly for all ARGB-family formats.
- Moved kShuffleAARRGGBB in row_gcc.cc to the top of the libyuv namespace for visibility.
- Cleaned up redundant format-specific row implementations.

Bug: libyuv:42280902
Change-Id: I67ffa4c476abc0d2dcc4650510d7bda91b65988e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7830291
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
2026-06-15 08:26:06 +08:00 · 2026-05-07 19:58:19 -07:00 · 2026-05-07 19:58:19 -07:00 · 4b4e68b372
commit 4b4e68b372
parent 4aacbbdfb4
10 changed files with 2858 additions and 3678 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1936
+Version: 1937
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@ -875,6 +875,19 @@ int BGRAToI420(const uint8_t* src_bgra,
               int width,
               int height);
 // BGRA little endian (argb in memory) to I422.
 LIBYUV_API
 int BGRAToI422(const uint8_t* src_bgra,
               int src_stride_bgra,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // ABGR little endian (rgba in memory) to I420.
 LIBYUV_API
 int ABGRToI420(const uint8_t* src_abgr,
@ -888,6 +901,19 @@ int ABGRToI420(const uint8_t* src_abgr,
               int width,
               int height);
 // ABGR little endian (rgba in memory) to I422.
 LIBYUV_API
 int ABGRToI422(const uint8_t* src_abgr,
               int src_stride_abgr,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
 int RGBAToI420(const uint8_t* src_rgba,
@ -901,6 +927,19 @@ int RGBAToI420(const uint8_t* src_rgba,
               int width,
               int height);
 // RGBA little endian (abgr in memory) to I422.
 LIBYUV_API
 int RGBAToI422(const uint8_t* src_rgba,
               int src_stride_rgba,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // RGB little endian (bgr in memory) to I420.
 LIBYUV_API
 int RGB24ToI420(const uint8_t* src_rgb24,
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@ -245,6 +245,19 @@ int ARGBToI422(const uint8_t* src_argb,
               int width,
               int height);
 // Convert ABGR To I422.
 LIBYUV_API
 int ABGRToI422(const uint8_t* src_abgr,
               int src_stride_abgr,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height);
 // RGB to I422 with matrix. See ArgbConstants at the top of this file for usage.
 LIBYUV_API
 int ARGBToI422Matrix(const uint8_t* src_argb,
@ -458,7 +471,7 @@ int ARGBToUYVY(const uint8_t* src_argb,
 // RAW to NV21 with Matrix
 LIBYUV_API
-int RGBToNV21Matrix(const uint8_t* src_raw,
+int RAWToNV21Matrix(const uint8_t* src_raw,
                    int src_stride_raw,
                    uint8_t* dst_y,
                    int dst_stride_y,
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1936
+#define LIBYUV_VERSION 1937
 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert.cc
+++ b/source/convert.cc
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/planar_functions.h"
 #include <assert.h>
@ -15,12 +16,10 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/scale_row.h"  // for ScaleRowDown2
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
@ -4745,8 +4744,8 @@ static int ARGBSobelize(const uint8_t* src_argb,
                                         uint8_t* dst,
                                         int width)) {
  int y;
-  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
-                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+      ARGBToYJRow_C;
  void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
                    uint8_t* dst_sobely, int width) = SobelYRow_C;
  void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
@ -4763,65 +4762,57 @@ static int ARGBSobelize(const uint8_t* src_argb,
    src_stride_argb = -src_stride_argb;
  }
-#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+#if defined(HAS_ARGBTOYJROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+      ARGBToYJRow = ARGBToYJRow_AVX2;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+#if defined(HAS_ARGBTOYROW_AVX512BW)
  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
    if (IS_ALIGNED(width, 64)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+      ARGBToYJRow = ARGBToYJRow_AVX512BW;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+#if defined(HAS_ARGBTOYJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+      ARGBToYJRow = ARGBToYJRow_NEON;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
+#if defined(HAS_ARGBTOYJROW_LSX)
  if (TestCpuFlag(kCpuHasNeonDotProd)) {
    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
    }
  }
 #endif
 #if defined(HAS_ARGBTOYMATRIXROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    ARGBToYJRow = ARGBToYJRow_Any_LSX;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+      ARGBToYJRow = ARGBToYJRow_LSX;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+#if defined(HAS_ARGBTOYJROW_LASX)
  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    ARGBToYJRow = ARGBToYJRow_Any_LASX;
    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+      ARGBToYJRow = ARGBToYJRow_LASX;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+#if defined(HAS_ARGBTOYJROW_RVV)
  if (TestCpuFlag(kCpuHasRVV)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+    ARGBToYJRow = ARGBToYJRow_RVV;
  }
 #endif
@ -4859,10 +4850,10 @@ static int ARGBSobelize(const uint8_t* src_argb,
    uint8_t* row_y2 = row_y1 + row_size;
    if (!rows)
      return 1;
-    ARGBToYMatrixRow(src_argb, row_y0, width, &kArgbJPEGConstants);
+    ARGBToYJRow(src_argb, row_y0, width);
    row_y0[-1] = row_y0[0];
    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
-    ARGBToYMatrixRow(src_argb, row_y1, width, &kArgbJPEGConstants);
+    ARGBToYJRow(src_argb, row_y1, width);
    row_y1[-1] = row_y1[0];
    memset(row_y1 + width, row_y1[width - 1], 16);
    memset(row_y2 + width, 0, 16);
@ -4872,7 +4863,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
      if (y < (height - 1)) {
        src_argb += src_stride_argb;
      }
-      ARGBToYMatrixRow(src_argb, row_y2, width, &kArgbJPEGConstants);
+      ARGBToYJRow(src_argb, row_y2, width);
      row_y2[-1] = row_y2[0];
      row_y2[width] = row_y2[width - 1];
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -753,28 +753,31 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
 MAKEROWYJ(RGBA, 3, 2, 1, 4)
 #undef MAKEROWYJ
-static __inline uint8_t RGBToYMatrix(uint8_t r,
+static __inline uint8_t RGBToYMatrix(uint8_t b0,
-                                     uint8_t g,
+                                     uint8_t b1,
-                                     uint8_t b,
+                                     uint8_t b2,
                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
-  return (c->kRGBToY[2] * r + c->kRGBToY[1] * g + c->kRGBToY[0] * b +
+  return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
-          c->kAddY[0]) >>
+          c->kRGBToY[3] * b3 + c->kAddY[0]) >>
         8;
 }
-static __inline uint8_t RGBToUMatrix(uint8_t r,
+static __inline uint8_t RGBToUMatrix(uint8_t b0,
-                                     uint8_t g,
+                                     uint8_t b1,
-                                     uint8_t b,
+                                     uint8_t b2,
                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
-  return (c->kAddUV[0] -
+  return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
-          (c->kRGBToU[2] * r + c->kRGBToU[1] * g + c->kRGBToU[0] * b)) >>
+                         c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
         8;
 }
-static __inline uint8_t RGBToVMatrix(uint8_t r,
+static __inline uint8_t RGBToVMatrix(uint8_t b0,
-                                     uint8_t g,
+                                     uint8_t b1,
-                                     uint8_t b,
+                                     uint8_t b2,
                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
-  return (c->kAddUV[0] -
+  return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
-          (c->kRGBToV[2] * r + c->kRGBToV[1] * g + c->kRGBToV[0] * b)) >>
+                         c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
         8;
 }
@ -784,7 +787,7 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
                        const struct ArgbConstants* c) {
  int x;
  for (x = 0; x < width; ++x) {
-    dst_y[0] = RGBToYMatrix(src_argb[2], src_argb[1], src_argb[0], c);
+    dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
    src_argb += 4;
    dst_y += 1;
  }
@ -799,25 +802,28 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
  const uint8_t* src_argb1 = src_argb + src_stride_argb;
  int x;
  for (x = 0; x < width - 1; x += 2) {
-    uint8_t ab =
+    uint8_t b0 =
        (src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
-    uint8_t ag =
+    uint8_t b1 =
        (src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
-    uint8_t ar =
+    uint8_t b2 =
        (src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
-    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    uint8_t b3 =
-    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
+        (src_argb[3] + src_argb[7] + src_argb1[3] + src_argb1[7] + 2) >> 2;
    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
    src_argb += 8;
    src_argb1 += 8;
    dst_u += 1;
    dst_v += 1;
  }
  if (width & 1) {
-    uint8_t ab = (src_argb[0] + src_argb1[0] + 1) >> 1;
+    uint8_t b0 = (src_argb[0] + src_argb1[0] + 1) >> 1;
-    uint8_t ag = (src_argb[1] + src_argb1[1] + 1) >> 1;
+    uint8_t b1 = (src_argb[1] + src_argb1[1] + 1) >> 1;
-    uint8_t ar = (src_argb[2] + src_argb1[2] + 1) >> 1;
+    uint8_t b2 = (src_argb[2] + src_argb1[2] + 1) >> 1;
-    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+    uint8_t b3 = (src_argb[3] + src_argb1[3] + 1) >> 1;
-    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
+    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
  }
 }
@ -828,11 +834,10 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
                            const struct ArgbConstants* c) {
  int x;
  for (x = 0; x < width; ++x) {
-    uint8_t ab = src_argb[0];
+    dst_u[0] =
-    uint8_t ag = src_argb[1];
+        RGBToUMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
-    uint8_t ar = src_argb[2];
+    dst_v[0] =
-    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
+        RGBToVMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
    src_argb += 4;
    dst_u += 1;
    dst_v += 1;
@ -1513,16 +1518,16 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
 #define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV)   \
-  const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =     \
      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV),   \
                        -(RV), 0, AY, AUV);                                    \
-  const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =     \
      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV),   \
                        -(BV), 0, AY, AUV);                                    \
-  const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =     \
      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),       \
                        -(GV), -(RV), AY, AUV);                                \
-  const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =     \
      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),       \
                        -(GV), -(BV), AY, AUV);
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -1848,32 +1848,41 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                               int width,
                               const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d16}, [%4]                   \n"  // load kRGBToU
+      "vld1.8      {d24}, [%4]                   \n"  // load kRGBToU
-      "vld1.8      {d17}, [%5]                   \n"  // load kRGBToV
+      "vld1.8      {d25}, [%5]                   \n"  // load kRGBToV
-      "vld1.16     {d18[0]}, [%6]                \n"  // load kAddUV[0]
+      "vld1.16     {d26[0]}, [%6]                \n"  // load kAddUV[0]
-      "vabs.s8     d16, d16                      \n"  // BU, GU, RU
+      "vmovl.s8    q10, d24                      \n"  // U coeffs (8 shorts)
-      "vabs.s8     d17, d17                      \n"  // BV, GV, RV
+      "vmovl.s8    q11, d25                      \n"  // V coeffs (8 shorts)
-      "vdup.8      d20, d16[0]                   \n"  // BU
+      "vdup.16     q6, d26[0]                    \n"  // bias
      "vdup.8      d21, d16[1]                   \n"  // GU
      "vdup.8      d22, d16[2]                   \n"  // RU
      "vdup.8      d23, d17[0]                   \n"  // BV
      "vdup.8      d24, d17[1]                   \n"  // GV
      "vdup.8      d25, d17[2]                   \n"  // RV
      "vdup.16     q15, d18[0]                   \n"  // kAddUV
      "1:          \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vmull.u8    q2, d0, d20                   \n"  // B * BU
      "vmlsl.u8    q2, d1, d21                   \n"  // - G * GU
      "vmlsl.u8    q2, d2, d22                   \n"  // - R * RU
-      "vmull.u8    q3, d2, d25                   \n"  // R * RV
+      "vmovl.u8    q4, d0                        \n"  // B
-      "vmlsl.u8    q3, d1, d24                   \n"  // - G * GV
+      "vmovl.u8    q5, d1                        \n"  // G
-      "vmlsl.u8    q3, d0, d23                   \n"  // - B * BV
+      "vmovl.u8    q7, d2                        \n"  // R
      "vmovl.u8    q8, d3                        \n"  // A
-      "vaddhn.u16  d0, q2, q15                   \n"  // signed -> unsigned
+      "vdup.16     q12, d20[0]                   \n"
-      "vaddhn.u16  d1, q3, q15                   \n"
+      "vmul.s16    q2, q4, q12                   \n"  // U = B * U0
      "vdup.16     q12, d20[1]                   \n"
      "vmla.s16    q2, q5, q12                   \n"  // U += G * U1
      "vdup.16     q12, d20[2]                   \n"
      "vmla.s16    q2, q7, q12                   \n"  // U += R * U2
      "vdup.16     q12, d20[3]                   \n"
      "vmla.s16    q2, q8, q12                   \n"  // U += A * U3
      "vdup.16     q12, d22[0]                   \n"
      "vmul.s16    q3, q4, q12                   \n"  // V = B * V0
      "vdup.16     q12, d22[1]                   \n"
      "vmla.s16    q3, q5, q12                   \n"  // V += G * V1
      "vdup.16     q12, d22[2]                   \n"
      "vmla.s16    q3, q7, q12                   \n"  // V += R * V2
      "vdup.16     q12, d22[3]                   \n"
      "vmla.s16    q3, q8, q12                   \n"  // V += A * V3
      "vsubhn.s16  d0, q6, q2                    \n"  // 128.0 - U
      "vsubhn.s16  d1, q6, q3                    \n"  // 128.0 - V
      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
@ -1885,8 +1894,8 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
      : "r"(&c->kRGBToU),   // %4
        "r"(&c->kRGBToV),   // %5
        "r"(&c->kAddUV)     // %6
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
-        "q12", "q13", "q14", "q15");
+        "q10", "q11", "q12");
 }
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
@ -1926,16 +1935,11 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                            const struct ArgbConstants* c) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
-      "vld1.8      {d18}, [%5]                   \n"  // load kRGBToU
+      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes, only 4 used)
-      "vld1.8      {d19}, [%6]                   \n"  // load kRGBToV
+      "vld1.8      {d25}, [%6]                   \n"  // load kRGBToV
-      "vmovl.s8    q8, d18                       \n"  // U coeffs in q8 (d16, d17)
+      "vmovl.s8    q14, d24                      \n"  // U coeffs in d28
-      "vmovl.s8    q9, d19                       \n"  // V coeffs in q9 (d18, d19)
+      "vmovl.s8    q15, d25                      \n"  // V coeffs in d30
-      "vdup.16     q10, d16[0]                   \n"  // U0
+      "vmov.u16    q11, #0x8000                  \n"  // 128.0 bias
      "vdup.16     q11, d16[1]                   \n"  // U1
      "vdup.16     q12, d16[2]                   \n"  // U2
      "vdup.16     q13, d18[0]                   \n"  // V0
      "vdup.16     q14, d18[1]                   \n"  // V1
      "vdup.16     q15, d18[2]                   \n"  // V2
      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
@ -1944,28 +1948,39 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
-      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ARGB pixels.
+      "vpaddl.u8   q3, q3                        \n"  // A 16 bytes -> 8 shorts.
-      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ARGB pixels.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more pixels.
-      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"
-      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8   q0, q4                        \n"  // B
-      "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q5                        \n"  // G
      "vpadal.u8   q2, q6                        \n"  // R
      "vpadal.u8   q3, q7                        \n"  // A
      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
      "vrshr.u16   q1, q1, #2                    \n"
      "vrshr.u16   q2, q2, #2                    \n"
      "vrshr.u16   q3, q3, #2                    \n"
-      "vmov.u16    q3, #0x8000                   \n"  // 128.0
+      "vdup.16     q12, d28[0]                   \n"
-
+      "vmul.s16    q8, q0, q12                   \n"  // U = B * U0
-      "vmul.s16    q8, q0, q10                   \n"  // U = B * U0
+      "vdup.16     q12, d28[1]                   \n"
-      "vmla.s16    q8, q1, q11                   \n"  // U += G * U1
+      "vmla.s16    q8, q1, q12                   \n"  // U += G * U1
      "vdup.16     q12, d28[2]                   \n"
      "vmla.s16    q8, q2, q12                   \n"  // U += R * U2
      "vdup.16     q12, d28[3]                   \n"
      "vmla.s16    q8, q3, q12                   \n"  // U += A * U3
-      "vmul.s16    q9, q0, q13                   \n"  // V = B * V0
+      "vdup.16     q12, d30[0]                   \n"
-      "vmla.s16    q9, q1, q14                   \n"  // V += G * V1
+      "vmul.s16    q9, q0, q12                   \n"  // V = B * V0
-      "vmla.s16    q9, q2, q15                   \n"  // V += R * V2
+      "vdup.16     q12, d30[1]                   \n"
      "vmla.s16    q9, q1, q12                   \n"  // V += G * V1
      "vdup.16     q12, d30[2]                   \n"
      "vmla.s16    q9, q2, q12                   \n"  // V += R * V2
      "vdup.16     q12, d30[3]                   \n"
      "vmla.s16    q9, q3, q12                   \n"  // V += A * V3
-      "vsubhn.s16  d0, q3, q8                    \n"  // 128.0 - U
+      "vsubhn.s16  d0, q11, q8                   \n"  // 128.0 - U
-      "vsubhn.s16  d1, q3, q9                    \n"  // 128.0 - V
+      "vsubhn.s16  d1, q11, q9                   \n"  // 128.0 - V
      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
@ -1978,7 +1993,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
  : "r"(&c->kRGBToU),  // %5
    "r"(&c->kRGBToV)   // %6
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    "q8", "q9", "q11", "q12", "q14", "q15"
  );
 }
@ -2212,44 +2227,8 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile (
+  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-      "add         %1, %0, %1                    \n"  // src_stride + src_bgra
+                         &kBgraI601Constants);
      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
      "vmov.u16    q15, #0x8000                  \n"  // 128.0
      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 BGRA pixels.
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 BGRA pixels.
      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
      "vpaddl.u8   q3, q3                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q2, q2                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // R 16 bytes -> 8 shorts.
      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more BGRA pixels.
      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 BGRA pixels.
      "vpadal.u8   q3, q7                        \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8   q2, q6                        \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8   q1, q5                        \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16   q1, q1, #2                    \n"  // average of 4
      "vrshr.u16   q2, q2, #2                    \n"
      "vrshr.u16   q3, q3, #2                    \n"
    RGBTOUV(q3, q2, q1)
      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
  : "+r"(src_bgra),  // %0
    "+r"(src_stride_bgra),  // %1
    "+r"(dst_u),     // %2-
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
 }
 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
@ -2257,44 +2236,8 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile (
+  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-      "add         %1, %0, %1                    \n"  // src_stride + src_abgr
+                         &kAbgrI601Constants);
      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
      "vmov.u16    q15, #0x8000                  \n"  // 128.0
      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ABGR pixels.
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ABGR pixels.
      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
      "vpaddl.u8   q2, q2                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q0, q0                        \n"  // R 16 bytes -> 8 shorts.
      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ABGR pixels.
      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ABGR pixels.
      "vpadal.u8   q2, q6                        \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
      "vrshr.u16   q1, q1, #2                    \n"
      "vrshr.u16   q2, q2, #2                    \n"
    RGBTOUV(q2, q1, q0)
      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_stride_abgr),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
 }
 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
@ -2302,44 +2245,8 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile (
+  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
-      "add         %1, %0, %1                    \n"  // src_stride + src_rgba
+                         &kRgbaI601Constants);
      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
      "vmov.u16    q15, #0x8000                  \n"  // 128.0
      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 RGBA pixels.
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 RGBA pixels.
      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
      "vpaddl.u8   q0, q1                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q2                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q2, q3                        \n"  // R 16 bytes -> 8 shorts.
      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more RGBA pixels.
      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 RGBA pixels.
      "vpadal.u8   q0, q5                        \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8   q1, q6                        \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8   q2, q7                        \n"  // R 16 bytes -> 8 shorts.
      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
      "vrshr.u16   q1, q1, #2                    \n"
      "vrshr.u16   q2, q2, #2                    \n"
    RGBTOUV(q0, q1, q2)
      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
  : "+r"(src_rgba),  // %0
    "+r"(src_stride_rgba),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
 }
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
@ -2801,15 +2708,16 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                            int width,
                            const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d20, d24[0]                   \n"  // B
-      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d21, d24[1]                   \n"  // G
-      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.8      d22, d24[2]                   \n"  // R
-      "vdup.16     q12, d18[0]                   \n"  // AY
+      "vdup.8      d23, d24[3]                   \n"  // A
      "vdup.16     q12, d25[0]                   \n"  // bias
      "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of ARGB
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of ARGB
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 pixels
      "subs        %1, %1, #16                   \n"  // 16 processed per loop.
      "vmull.u8    q8, d0, d20                   \n"  // B
      "vmull.u8    q9, d1, d20                   \n"
@ -2817,6 +2725,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
      "vmlal.u8    q9, d3, d21                   \n"
      "vmlal.u8    q8, d4, d22                   \n"  // R
      "vmlal.u8    q9, d5, d22                   \n"
      "vmlal.u8    q8, d6, d23                   \n"  // A
      "vmlal.u8    q9, d7, d23                   \n"
      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
      "vaddhn.u16  d1, q9, q12                   \n"
      "vst1.8      {d0, d1}, [%2]!               \n"  // store 16 pixels Y.
@ -2826,8 +2736,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
        "+r"(dst_y)        // %2
      : "r"(&c->kRGBToY),  // %3
        "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
-        "q12");
+        "d24", "d25");
 }
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -2846,52 +2756,20 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
 }
 // Computes one row of Y from 4-byte pixels whose first byte is ignored.
 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
 // Same code as ARGB, except for which registers of the LD4 feed the
 // multiplies: bytes 1, 2 and 3 of every pixel are weighted by c->kRGBToY[0],
 // [1] and [2]; byte 0 (d0/d1) is loaded but never read.
 // Each output is the high byte of (weighted sum + c->kAddY[0]).
 // 16 pixels are handled per iteration and the count is only tested after the
 // first pass, so the loop body always runs at least once.
 static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct ArgbConstants* c) {
  asm volatile(
      "vld1.8      {d16}, [%3]                   \n"  // load 8 bytes of kRGBToY
      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
      "vdup.8      d20, d16[0]                   \n"  // BY (coefficient 0)
      "vdup.8      d21, d16[1]                   \n"  // GY (coefficient 1)
      "vdup.8      d22, d16[2]                   \n"  // RY (coefficient 2)
      "vdup.16     q12, d18[0]                   \n"  // kAddY[0] in all 8 lanes
      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of RGBA
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 pixels
      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
      "vmull.u8    q8, d2, d20                   \n"  // B
      "vmull.u8    q9, d3, d20                   \n"
      "vmlal.u8    q8, d4, d21                   \n"  // G
      "vmlal.u8    q9, d5, d21                   \n"
      "vmlal.u8    q8, d6, d22                   \n"  // R
      "vmlal.u8    q9, d7, d22                   \n"
      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
      "vaddhn.u16  d1, q9, q12                   \n"
      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
      "bgt         1b                            \n"
      : "+r"(src_rgba),    // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      : "r"(&c->kRGBToY),  // %3
        "r"(&c->kAddY)     // %4
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
        "q12");
 }
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
 }
 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
 }
 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
+  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
 }
 // BGRA entry point for the YJ row conversion: forwards the row unchanged to
 // the generic ARGBToYMatrixRow_NEON together with the kBgraJPEGConstants table.
 void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_bgra, dst_yj, width, &kBgraJPEGConstants);
 }
 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -2899,12 +2777,12 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                                 int width,
                                 const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d20, d24[0]                   \n"  // BY
-      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d21, d24[1]                   \n"  // GY
-      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.8      d22, d24[2]                   \n"  // RY
-      "vdup.16     q12, d18[0]                   \n"  // AY
+      "vdup.16     q12, d25[0]                   \n"  // AY
      "1:          \n"
      "vld3.8      {d2, d4, d6}, [%0]!           \n"  // load 16 pixels of
                                                      // RGB24.
@ -2925,8 +2803,8 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
        "+r"(width)        // %2
      : "r"(&c->kRGBToY),  // %3
        "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
-        "q12");
+        "d24", "d25");
 }
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2736,47 +2736,61 @@ struct RgbUVConstants {
 };
 // 8x1 pixels.
-static void ARGBToUV444MatrixRow_NEON(
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
    const uint8_t* src_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width,
-    const struct RgbUVConstants* rgbuvconstants) {
+                               const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         d0, [%4]                      \n"  // load rgbuvconstants
+      "ldr        q16, [%[c], #16]               \n" // kRGBToU
-      "dup         v24.16b, v0.b[0]              \n"  // UB  0.875 coefficient
+      "ldr        q17, [%[c], #32]               \n" // kRGBToV
-      "dup         v25.16b, v0.b[1]              \n"  // UG -0.5781 coefficient
+      "ldr        s0, [%[c], #64]                \n" // kAddUV
-      "dup         v26.16b, v0.b[2]              \n"  // UR -0.2969 coefficient
+      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
-      "dup         v27.16b, v0.b[4]              \n"  // VB -0.1406 coefficient
+      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
-      "dup         v28.16b, v0.b[5]              \n"  // VG -0.7344 coefficient
+      "dup        v20.8h, v16.h[0]               \n" // U0
-      "neg         v24.16b, v24.16b              \n"
+      "dup        v21.8h, v16.h[1]               \n" // U1
-      "movi        v29.8h, #0x80, lsl #8         \n"  // 128.0
+      "dup        v22.8h, v16.h[2]               \n" // U2
-
+      "dup        v23.8h, v16.h[3]               \n" // U3
      "dup        v24.8h, v17.h[0]               \n" // V0
      "dup        v26.8h, v17.h[1]               \n" // V1
      "dup        v27.8h, v17.h[2]               \n" // V2
      "dup        v28.8h, v17.h[3]               \n" // V3
      "dup        v25.8h, v0.h[0]                \n" // kAddUV
      "1:          \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "umull       v4.8h, v0.8b, v24.8b          \n"  // B
      "umlsl       v4.8h, v1.8b, v25.8b          \n"  // G
      "umlsl       v4.8h, v2.8b, v26.8b          \n"  // R
      "prfm        pldl1keep, [%0, 448]          \n"
-      "umull       v3.8h, v2.8b, v24.8b          \n"  // R
+      "uxtl        v4.8h, v0.8b                  \n"
-      "umlsl       v3.8h, v1.8b, v28.8b          \n"  // G
+      "uxtl        v5.8h, v1.8b                  \n"
-      "umlsl       v3.8h, v0.8b, v27.8b          \n"  // B
+      "uxtl        v6.8h, v2.8b                  \n"
      "uxtl        v7.8h, v3.8b                  \n"
-      "addhn       v0.8b, v4.8h, v29.8h          \n"  // signed -> unsigned
+      // U = B*U0 + G*U1 + R*U2 + A*U3
-      "addhn       v1.8b, v3.8h, v29.8h          \n"
+      "mul         v18.8h, v4.8h, v20.8h         \n"
      "mla         v18.8h, v5.8h, v21.8h         \n"
      "mla         v18.8h, v6.8h, v22.8h         \n"
      "mla         v18.8h, v7.8h, v23.8h         \n"
-      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
+      // V = B*V0 + G*V1 + R*V2 + A*V3
-      "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
+      "mul         v19.8h, v4.8h, v24.8h         \n"
      "mla         v19.8h, v5.8h, v26.8h         \n"
      "mla         v19.8h, v6.8h, v27.8h         \n"
      "mla         v19.8h, v7.8h, v28.8h         \n"
      "subhn       v0.8b, v25.8h, v18.8h         \n"
      "subhn       v1.8b, v25.8h, v19.8h         \n"
      "st1         {v0.8b}, [%1], #8             \n"
      "st1         {v1.8b}, [%2], #8             \n"
      "b.gt        1b                            \n"
      : "+r"(src_argb),     // %0
        "+r"(dst_u),        // %1
        "+r"(dst_v),        // %2
        "+r"(width)         // %3
-      : "r"(rgbuvconstants)  // %4
+      : [c] "r"(c)          // %4
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v27", "v28", "v29");
+        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
        "v26", "v27", "v28");
 }
 static void ARGBToUV444MatrixRow_NEON_I8MM(
@ -2784,10 +2798,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
    uint8_t* dst_u,
    uint8_t* dst_v,
    int width,
-    const struct RgbUVConstants* rgbuvconstants) {
+    const struct ArgbConstants* c) {
  asm volatile(
-      "ld2r        {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
+      "ldr         q16, [%[c], #16]              \n" // kRGBToU
-      "movi        v29.8h, #0x80, lsl #8         \n"  // 128.0
+      "ldr         q17, [%[c], #32]              \n" // kRGBToV
      "ldr         s0, [%[c], #64]               \n" // kAddUV
      "dup         v29.8h, v0.h[0]               \n" // 128.0
      "1:          \n"
      "ldp         q0, q1, [%[src]], #32         \n"
      "subs        %w[width], %w[width], #8      \n"  // 8 processed per loop.
@ -2811,7 +2827,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
        [dst_u] "+r"(dst_u),      // %[dst_u]
        [dst_v] "+r"(dst_v),      // %[dst_v]
        [width] "+r"(width)       // %[width]
-      : [rgbuvconstants] "r"(rgbuvconstants)  // %[rgbuvconstants]
+      : [c] "r"(c)  // %[c]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
        "v29");
 }
@ -2824,15 +2840,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
 // VG -0.7344 coefficient = -94
 // VR   0.875 coefficient = 112
 static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
                                                           {18, 94, -112, 0}};
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kARGBI601UVConstants);
+                            &kArgbI601Constants);
 }
 void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2840,26 +2853,15 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
                              uint8_t* dst_v,
                              int width) {
  ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
-                                 &kARGBI601UVConstants);
+                                 &kArgbI601Constants);
 }
 // RGB to JPEG coefficients
 // UB  0.500    coefficient = 128
 // UG -0.33126  coefficient = -85
 // UR -0.16874  coefficient = -43
 // VB -0.08131  coefficient = -21
 // VG -0.41869  coefficient = -107
 // VR 0.500     coefficient = 128
 static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
                                                           {21, 107, -128, 0}};
 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kARGBJPEGUVConstants);
+                            &kArgbJPEGConstants);
 }
 void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2867,7 +2869,7 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
                               uint8_t* dst_v,
                               int width) {
  ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
-                                 &kARGBJPEGUVConstants);
+                                 &kArgbJPEGConstants);
 }
 #define RGBTOUV_SETUP_REG                                                  \
@ -2906,12 +2908,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "ldr        q17, [%[c], #32]               \n" // kRGBToV
      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
-      "dup        v20.8h, v16.h[0]               \n" // U0 (-BU)
+      "dup        v20.8h, v16.h[0]               \n" // U0
-      "dup        v21.8h, v16.h[1]               \n" // U1 (-GU)
+      "dup        v21.8h, v16.h[1]               \n" // U1
-      "dup        v22.8h, v16.h[2]               \n" // U2 (-RU)
+      "dup        v22.8h, v16.h[2]               \n" // U2
-      "dup        v23.8h, v17.h[0]               \n" // V0 (-BV)
+      "dup        v23.8h, v16.h[3]               \n" // U3
-      "dup        v24.8h, v17.h[1]               \n" // V1 (-GV)
+      "dup        v24.8h, v17.h[0]               \n" // V0
-      "dup        v26.8h, v17.h[2]               \n" // V2 (-RV)
+      "dup        v26.8h, v17.h[1]               \n" // V1
      "dup        v27.8h, v17.h[2]               \n" // V2
      "dup        v28.8h, v17.h[3]               \n" // V3
      "movi       v25.8h, #0x80, lsl #8          \n" // 128.0 in 16-bit (0x8000)
      "1:          \n"
@ -2921,26 +2925,31 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "prfm        pldl1keep, [%0, 448]          \n"
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
      "uaddlp      v18.8h, v3.16b                \n"  // A 16 bytes -> 8 shorts.
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
      "uadalp      v18.8h, v7.16b                \n"  // A 16 bytes -> 8 shorts.
      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
      "urshr       v1.8h, v1.8h, #2              \n"
      "urshr       v2.8h, v2.8h, #2              \n"
      "urshr       v18.8h, v18.8h, #2             \n"
-      // U = B*U0 + G*U1 + R*U2
+      // U = B*U0 + G*U1 + R*U2 + A*U3
      "mul        v3.8h, v0.8h, v20.8h          \n"
      "mla        v3.8h, v1.8h, v21.8h          \n"
      "mla        v3.8h, v2.8h, v22.8h          \n"
      "mla        v3.8h, v18.8h, v23.8h         \n"
-      // V = B*V0 + G*V1 + R*V2
+      // V = B*V0 + G*V1 + R*V2 + A*V3
-      "mul        v4.8h, v0.8h, v23.8h          \n"
+      "mul        v4.8h, v0.8h, v24.8h          \n"
-      "mla        v4.8h, v1.8h, v24.8h          \n"
+      "mla        v4.8h, v1.8h, v26.8h          \n"
-      "mla        v4.8h, v2.8h, v26.8h          \n"
+      "mla        v4.8h, v2.8h, v27.8h          \n"
      "mla        v4.8h, v18.8h, v28.8h         \n"
      // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
      "subhn      v0.8b, v25.8h, v3.8h           \n"
@ -2956,7 +2965,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
    "+r"(width)        // %4
  : [c] "r"(c)         // %5
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
+    "v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
    "v27", "v28"
  );
 }
@ -2974,44 +2984,35 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+  ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
-  asm volatile (
+                         &kArgbJPEGConstants);
-      "movi        v20.8h, #128                  \n"  // UB/VR coeff (0.500)
+}
      "movi        v21.8h, #85                   \n"  // UG coeff (-0.33126)
      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
      "movi        v23.8h, #21                   \n"  // VB coeff (-0.08131)
      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
      "movi        v25.8h, #0x80, lsl #8         \n"  // 128.0 (0x8000 in 16-bit)
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
-      "urshr       v1.8h, v1.8h, #2              \n"
+                      int src_stride_abgr,
-      "urshr       v2.8h, v2.8h, #2              \n"
+                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                         &kAbgrI601Constants);
 }
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+                      int src_stride_bgra,
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+                      uint8_t* dst_u,
-      "b.gt        1b                            \n"
+                      uint8_t* dst_v,
-  : "+r"(src_argb),  // %0
+                      int width) {
-    "+r"(src_argb_1),  // %1
+  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-    "+r"(dst_u),     // %2
+                         &kBgraI601Constants);
-    "+r"(dst_v),     // %3
+}
-    "+r"(width)        // %4
+
-  :
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                      int src_stride_rgba,
-    "v20", "v21", "v22", "v23", "v24", "v25"
+                      uint8_t* dst_u,
-  );
+                      uint8_t* dst_v,
                      int width) {
  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
                         &kRgbaI601Constants);
 }
 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
@ -3019,44 +3020,8 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                       uint8_t* dst_uj,
                       uint8_t* dst_vj,
                       int width) {
-  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
+  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
-  asm volatile (
+                         &kAbgrJPEGConstants);
      "movi        v20.8h, #128                  \n"  // UB/VR coeff (0.500)
      "movi        v21.8h, #85                   \n"  // UG coeff (-0.33126)
      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
      "movi        v23.8h, #21                   \n"  // VB coeff (-0.08131)
      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
      "movi        v25.8h, #0x80, lsl #8         \n"  // 128.0 (0x8000 in 16-bit)
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
      "urshr       v1.8h, v1.8h, #2              \n"
      "urshr       v2.8h, v2.8h, #2              \n"
    RGBTOUV(v2.8h, v1.8h, v0.8h)
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_abgr_1),  // %1
    "+r"(dst_uj),     // %2
    "+r"(dst_vj),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
 }
 void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
@ -3149,126 +3114,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
  );
 }
 // Produces one row of subsampled U and V from two adjacent rows of 4-byte
 // pixels: src_bgra and src_bgra + src_stride_bgra. For every channel the
 // horizontal pairs of both rows are summed and rounded-shifted right by 2,
 // i.e. each 2x2 block is averaged, before RGBTOUV is applied.
 // As used below, byte 3 of a pixel is B, byte 2 is G and byte 1 is R; byte 0
 // is loaded but not used. 16 source pixels per row yield 8 U and 8 V bytes per
 // iteration; the count is tested only after the first pass.
 // NOTE(review): RGBTOUV_SETUP_REG and RGBTOUV are defined outside this view;
 // presumably they set up coefficients in v20-v25 and leave U in v0 and V in
 // v1 (the stores below write v0 to dst_u and v1 to dst_v) - confirm.
 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;  // second source row
  asm volatile (
    RGBTOUV_SETUP_REG
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      "uaddlp      v0.8h, v3.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"
      "uaddlp      v3.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v1.16b                 \n"  // R 16 bytes -> 8 shorts.
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
      "uadalp      v0.8h, v7.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v3.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v2.8h, v5.16b                 \n"  // R 16 bytes -> 8 shorts.
      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
      "urshr       v1.8h, v3.8h, #2              \n"  // G average moves to v1
      "urshr       v2.8h, v2.8h, #2              \n"
    RGBTOUV(v0.8h, v1.8h, v2.8h)
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"
  : "+r"(src_bgra),  // %0
    "+r"(src_bgra_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
 }
 // Produces one row of subsampled U and V from two adjacent rows of 4-byte
 // pixels: src_abgr and src_abgr + src_stride_abgr. For every channel the
 // horizontal pairs of both rows are summed and rounded-shifted right by 2,
 // i.e. each 2x2 block is averaged, before RGBTOUV is applied.
 // As used below, byte 2 of a pixel is B, byte 1 is G and byte 0 is R; byte 3
 // is loaded but not used. 16 source pixels per row yield 8 U and 8 V bytes per
 // iteration; the count is tested only after the first pass.
 // NOTE(review): RGBTOUV_SETUP_REG and RGBTOUV are defined outside this view;
 // presumably they set up coefficients in v20-v25 and leave U in v0 and V in
 // v1 (the stores below write v0 to dst_u and v1 to dst_v) - confirm.
 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;  // second source row
  asm volatile (
    RGBTOUV_SETUP_REG
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      "uaddlp      v3.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"
      "uaddlp      v2.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v1.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
      "uadalp      v3.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v2.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v1.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
      "urshr       v0.8h, v3.8h, #2              \n"  // average of 4, B to v0
      "urshr       v2.8h, v2.8h, #2              \n"
      "urshr       v1.8h, v1.8h, #2              \n"
    RGBTOUV(v0.8h, v2.8h, v1.8h)
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"
  : "+r"(src_abgr),  // %0
    "+r"(src_abgr_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
 }
 // Produces one row of subsampled U and V from two adjacent rows of 4-byte
 // pixels: src_rgba and src_rgba + src_stride_rgba. For every channel the
 // horizontal pairs of both rows are summed and rounded-shifted right by 2,
 // i.e. each 2x2 block is averaged, before RGBTOUV is applied.
 // As used below, byte 1 of a pixel is B, byte 2 is G and byte 3 is R; byte 0
 // is loaded but not used. 16 source pixels per row yield 8 U and 8 V bytes per
 // iteration; the count is tested only after the first pass.
 // NOTE(review): RGBTOUV_SETUP_REG and RGBTOUV are defined outside this view;
 // presumably they set up coefficients in v20-v25 and leave U in v0 and V in
 // v1 (the stores below write v0 to dst_u and v1 to dst_v) - confirm.
 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      int src_stride_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;  // second source row
  asm volatile (
    RGBTOUV_SETUP_REG
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      "uaddlp      v0.8h, v1.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"
      "uaddlp      v1.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v3.16b                 \n"  // R 16 bytes -> 8 shorts.
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
      "uadalp      v0.8h, v5.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v1.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v2.8h, v7.16b                 \n"  // R 16 bytes -> 8 shorts.
      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
      "urshr       v1.8h, v1.8h, #2              \n"
      "urshr       v2.8h, v2.8h, #2              \n"
    RGBTOUV(v0.8h, v1.8h, v2.8h)
      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
      "b.gt        1b                            \n"
  : "+r"(src_rgba),  // %0
    "+r"(src_rgba_1),  // %1
    "+r"(dst_u),     // %2
    "+r"(dst_v),     // %3
    "+r"(width)        // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
 }
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
@ -3483,18 +3328,19 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
  );
 }
-// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
+// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
 static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
                                        int src_stride,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width,
-                                        const int8_t* uvconstants) {
+                                        const struct ArgbConstants* c) {
  const uint8_t* src1 = src + src_stride;
  asm volatile(
      "movi        v23.8h, #0x80, lsl #8           \n"  // 128.0 (0x8000 in
                                                        // 16-bit)
-      "ld2r        {v24.4s, v25.4s}, [%[uvconstants]] \n"
+      "ldr         q24, [%[c], #16]                \n"  // kRGBToU
      "ldr         q25, [%[c], #32]                \n"  // kRGBToV
      "1:          \n"
      "ld2         {v0.4s, v1.4s}, [%[src]], #32   \n"  // load 8 pixels
@ -3547,51 +3393,19 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
        [dst_u] "+r"(dst_u),            // %[dst_u]
        [dst_v] "+r"(dst_v),            // %[dst_v]
        [width] "+r"(width)             // %[width]
-      : [uvconstants] "r"(uvconstants)  // %[uvconstants]
+      : [c] "r"(c)                      // %[c]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
        "v24", "v25");
 }
 // RGB to BT601 coefficients
 // UB   0.875 coefficient = 112
 // UG -0.5781 coefficient = -74
 // UR -0.2969 coefficient = -38
 // VB -0.1406 coefficient = -18
 // VG -0.7344 coefficient = -94
 // VR   0.875 coefficient = 112
 // I8MM constants are stored negated such that we can store 128 in int8_t.
 // Each table holds 8 values: four U weights followed by four V weights, laid
 // out in the byte order of the pixel format named in the table; the 0 entry
 // sits at the alpha position.
 // NOTE(review): the *ToUVRow_NEON_I8MM wrappers visible in this change now
 // pass &k*I601Constants instead of these tables, so they are presumably unused
 // afterwards - confirm against the rest of the file and delete if so.
 static const int8_t kARGBToUVCoefficients[] = {
    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
    -112, 74, 38, 0, 18, 94, -112, 0,
 };
 static const int8_t kABGRToUVCoefficients[] = {
    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
    38, 74, -112, 0, -112, 94, 18, 0,
 };
 static const int8_t kBGRAToUVCoefficients[] = {
    // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
    0, 38, 74, -112, 0, -112, 94, 18,
 };
 static const int8_t kRGBAToUVCoefficients[] = {
    // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
    0, -112, 74, 38, 0, 18, 94, -112,
 };
 void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                  int src_stride_argb,
                                  uint8_t* dst_u,
                                  uint8_t* dst_v,
                                  int width,
                                  const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {
-      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  // Public entry point for the matrix UV row: forwards every argument,
+  // including the caller's ArgbConstants, to the static _Impl kernel. The
+  // kernel now reads the U and V weights from `c` itself, so no local
+  // repacking of the coefficients is needed here.
   ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                                   uvconstants);
+                                   c);
 }
 void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
@ -3600,7 +3414,7 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                              kARGBToUVCoefficients);
+                              &kArgbI601Constants);
 }
 void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3609,7 +3423,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                              kABGRToUVCoefficients);
+                              &kAbgrI601Constants);
 }
 void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
@ -3618,7 +3432,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-                              kBGRAToUVCoefficients);
+                              &kBgraI601Constants);
 }
 void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
@ -3627,35 +3441,16 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
-                              kRGBAToUVCoefficients);
+                              &kRgbaI601Constants);
 }
 // RGB to JPEG coefficients
 // UB  0.500    coefficient = 128
 // UG -0.33126  coefficient = -85
 // UR -0.16874  coefficient = -43
 // VB -0.08131  coefficient = -21
 // VG -0.41869  coefficient = -107
 // VR 0.500     coefficient = 128
 // I8MM constants are stored negated such that we can store 128 in int8_t.
 // (+128 does not fit in int8_t, so UB and VR are stored as -128 and every
 // other weight has its sign flipped to match.)
 // NOTE(review): ARGBToUVJRow_NEON_I8MM and ABGRToUVJRow_NEON_I8MM now pass
 // &kArgbJPEGConstants / &kAbgrJPEGConstants, so these two tables are
 // presumably unused after this change - confirm and delete if so.
 static const int8_t kARGBToUVJCoefficients[] = {
    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
    -128, 85, 43, 0, 21, 107, -128, 0,
 };
 static const int8_t kABGRToUVJCoefficients[] = {
    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
    43, 85, -128, 0, -128, 107, 21, 0,
 };
 // ARGB to U and V with the JPEG constants. Thin wrapper: forwards all
 // arguments to ARGBToUVMatrixRow_NEON_I8MM_Impl, now passing
 // &kArgbJPEGConstants in place of the kARGBToUVJCoefficients table.
 void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                             int src_stride_argb,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
   ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                              kARGBToUVJCoefficients);
+                              &kArgbJPEGConstants);
 }
 void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3664,7 +3459,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_v,
                            int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                              kABGRToUVJCoefficients);
+                              &kAbgrJPEGConstants);
 }
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@ -3771,206 +3566,145 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                                  int width,
                                  const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         s0, [%3]                      \n"  // load rgbconstants
+      "ldr         s16, [%3]                     \n"  // load 4 coeffs
-      "ldr         s1, [%3, #48]                 \n"
+      "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
-      "dup         v6.16b, v0.b[0]               \n"
+      "dup         v18.16b, v16.b[0]             \n"  // B
-      "dup         v7.16b, v0.b[1]               \n"
+      "dup         v19.16b, v16.b[1]             \n"  // G
-      "dup         v16.16b, v0.b[2]              \n"
+      "dup         v20.16b, v16.b[2]             \n"  // R
-      "dup         v17.8h,  v1.h[0]              \n"
+      "dup         v21.16b, v16.b[3]             \n"  // A
      "dup         v22.8h,  v17.h[0]             \n"  // bias
      "1:          \n"
      "ld4         {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n"  // load 16
                                                                 // pixels.
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
-      "umull       v0.8h, v2.8b, v6.8b           \n"  // B
+      "umull       v0.8h, v2.8b, v18.8b          \n"  // B
-      "umull2      v1.8h, v2.16b, v6.16b         \n"
+      "umull2      v1.8h, v2.16b, v18.16b        \n"
      "prfm        pldl1keep, [%0, 448]          \n"
-      "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
+      "umlal       v0.8h, v3.8b, v19.8b          \n"  // G
-      "umlal2      v1.8h, v3.16b, v7.16b         \n"
+      "umlal2      v1.8h, v3.16b, v19.16b        \n"
-      "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
+      "umlal       v0.8h, v4.8b, v20.8b          \n"  // R
-      "umlal2      v1.8h, v4.16b, v16.16b        \n"
+      "umlal2      v1.8h, v4.16b, v20.16b        \n"
-      "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
+      "umlal       v0.8h, v5.8b, v21.8b          \n"  // A
-      "addhn       v1.8b, v1.8h, v17.8h          \n"
+      "umlal2      v1.8h, v5.16b, v21.16b        \n"
      "addhn       v0.8b, v0.8h, v22.8h          \n"  // 16 bit to 8 bit Y
      "addhn       v1.8b, v1.8h, v22.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      : "r"(c)             // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
-        "v17");
+        "v19", "v20", "v21", "v22");
 }
 void ARGBToYMatrixRow_NEON_DotProd(
     const uint8_t* src_argb,
     uint8_t* dst_y,
     int width,
     const struct ArgbConstants* c) {
+  // Converts a row of 4-byte pixels to 8-bit Y with the DotProd extension.
+  // Reads 4 weight bytes from offset 0 of `c` and a 16-bit bias from offset 48,
+  // then for each pixel computes (dot(pixel bytes, weights) + bias) >> 8.
+  // Handles 16 pixels per iteration and loops while the remaining width is
+  // above zero, so `width` is expected to be a positive multiple of 16.
   asm volatile(
-      "ldr         s0, [%3]                      \n"  // load rgbconstants
+      "ldr         s16, [%3]                     \n"  // load 4 coeffs
-      "ldr         s1, [%3, #48]                 \n"
+      "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
-      "dup         v16.4s, v0.s[0]               \n"
+      "dup         v18.4s, v16.s[0]              \n"  // 4 coeffs in every lane
-      "dup         v17.8h,  v1.h[0]              \n"
+      "dup         v19.8h, v17.h[0]              \n"  // bias in every lane
       "1:          \n"
       "ld1         {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n"  // load 16
                                                                     // pixels.
       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
       "movi        v0.16b, #0                    \n"
       "movi        v1.16b, #0                    \n"
       "movi        v2.16b, #0                    \n"
       "movi        v3.16b, #0                    \n"
-      "udot        v0.4s, v4.16b, v16.16b        \n"
+      "udot        v0.4s, v4.16b, v18.16b        \n"  // one 32-bit sum per pixel
-      "udot        v1.4s, v5.16b, v16.16b        \n"
+      "udot        v1.4s, v5.16b, v18.16b        \n"
-      "udot        v2.4s, v6.16b, v16.16b        \n"
+      "udot        v2.4s, v6.16b, v18.16b        \n"
-      "udot        v3.4s, v7.16b, v16.16b        \n"
+      "udot        v3.4s, v7.16b, v18.16b        \n"
       "uzp1        v0.8h, v0.8h, v1.8h           \n"
       "uzp1        v1.8h, v2.8h, v3.8h           \n"
-      "addhn       v0.8b, v0.8h, v17.8h          \n"
+      "addhn       v0.8b, v0.8h, v19.8h          \n"  // add bias, keep high byte
-      "addhn       v1.8b, v1.8h, v17.8h          \n"
+      "addhn       v1.8b, v1.8h, v19.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
       : "+r"(src_argb),    // %0
         "+r"(dst_y),       // %1
         "+r"(width)        // %2
       : "r"(c)             // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v17");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19");
 }
 // RGB to JPEG coefficients
 // B * 0.1140 coefficient = 29
 // G * 0.5870 coefficient = 150
 // R * 0.2990 coefficient = 77
 // Add 0.5 = 0x0080
 // In each initializer the first member holds the per-byte Y weights and the
 // fourth member holds the bias; the row kernels in this file read them from
 // byte offsets 0 and 48 of the struct.
 // The *DotProdConstants variants hold the same weights moved up one byte
 // behind a leading 0. They were passed for RGBA/BGRA input, where the first
 // byte of each pixel is alpha and must not contribute.
 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
 static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
 static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
 // RGB to BT.601 coefficients
 // B * 0.1016 coefficient = 25
 // G * 0.5078 coefficient = 129
 // R * 0.2578 coefficient = 66
 // Add 16.5 = 0x1080
 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
 static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
 static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
 static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};
 // ARGB row to Y: runs ARGBToYMatrixRow_NEON with kArgbI601Constants
 // (kRgb24I601Constants before this change).
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);
 }
 // ARGB row to YJ: runs ARGBToYMatrixRow_NEON with kArgbJPEGConstants
 // (kRgb24JPEGConstants before this change).
 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);
 }
 // ABGR row to Y: runs ARGBToYMatrixRow_NEON with kAbgrI601Constants
 // (kRawI601Constants before this change).
 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);
 }
 // ABGR row to YJ: runs ARGBToYMatrixRow_NEON with kAbgrJPEGConstants
 // (kRawJPEGConstants before this change).
 void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
 }
 // ARGB row to Y, DotProd variant: runs ARGBToYMatrixRow_NEON_DotProd with
 // kArgbI601Constants (kRgb24I601Constants before this change).
 void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
                              uint8_t* dst_y,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants);
+  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kArgbI601Constants);
 }
 // ARGB row to YJ, DotProd variant: runs ARGBToYMatrixRow_NEON_DotProd with
 // kArgbJPEGConstants (kRgb24JPEGConstants before this change).
 void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
                               uint8_t* dst_yj,
                               int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kArgbJPEGConstants);
 }
 // ABGR row to Y, DotProd variant: runs ARGBToYMatrixRow_NEON_DotProd with
 // kAbgrI601Constants (kRawI601Constants before this change).
 void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
                              uint8_t* dst_y,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kRawI601Constants);
+  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kAbgrI601Constants);
 }
 // ABGR row to YJ, DotProd variant: runs ARGBToYMatrixRow_NEON_DotProd with
 // kAbgrJPEGConstants (kRawJPEGConstants before this change).
 void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
                               uint8_t* dst_yj,
                               int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kRawJPEGConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
 }
 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
 // Same code as ARGB, except the LD4
 // Only the first three weight bytes of `c` are used; they are applied to
 // bytes 1, 2 and 3 of every pixel. The B/G/R labels on the multiplies follow
 // the RGBA byte order; BGRAToYRow_NEON reused this code with the constants
 // permuted.
 // NOTE(review): the RGBA/BGRA wrappers visible in this change now call
 // ARGBToYMatrixRow_NEON instead; if no other caller remains this static
 // helper is dead and should be removed - confirm against the rest of the file.
 static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
                                   uint8_t* dst_y,
                                   int width,
                                   const struct ArgbConstants* c) {
   asm volatile(
       "ldr         s0, [%3]                      \n"  // load rgbconstants
       "ldr         s1, [%3, #48]                 \n"  // load bias halfword
       "dup         v6.16b, v0.b[0]               \n"
       "dup         v7.16b, v0.b[1]               \n"
       "dup         v16.16b, v0.b[2]              \n"
       "dup         v17.8h,  v1.h[0]              \n"
       "1:          \n"
       // v1 receives byte 0 (alpha) and is overwritten by umull2 below.
       "ld4         {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n"  // load 16
                                                                  // pixels.
       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
       "umull       v0.8h, v2.8b, v6.8b           \n"  // B
       "umull2      v1.8h, v2.16b, v6.16b         \n"
       "prfm        pldl1keep, [%0, 448]          \n"
       "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
       "umlal2      v1.8h, v3.16b, v7.16b         \n"
       "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
       "umlal2      v1.8h, v4.16b, v16.16b        \n"
       "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
       "addhn       v1.8b, v1.8h, v17.8h          \n"
       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
       "b.gt        1b                            \n"
       : "+r"(src_rgba),    // %0
         "+r"(dst_y),       // %1
         "+r"(width)        // %2
       : "r"(c)  // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
         "v17");
 }
 // RGBA row to Y: now runs the 4-channel ARGBToYMatrixRow_NEON with
 // kRgbaI601Constants instead of RGBAToYMatrixRow_NEON with
 // kRgb24I601Constants.
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
 }
 // RGBA row to YJ: now runs the 4-channel ARGBToYMatrixRow_NEON with
 // kRgbaJPEGConstants instead of RGBAToYMatrixRow_NEON with
 // kRgb24JPEGConstants.
 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
 }
 // BGRA row to Y: now runs the 4-channel ARGBToYMatrixRow_NEON with
 // kBgraI601Constants instead of RGBAToYMatrixRow_NEON with kRawI601Constants.
 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
 }
 void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
                              uint8_t* dst_y,
                              int width) {
-  // No need for a separate implementation for RGBA inputs, just permute the
-  // RGB constants.
-  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width,
-                                &kRgb24I601DotProdConstants);
+  // RGBA reuses the generic DotProd row kernel. kRgbaI601Constants is expected
+  // to carry the per-byte weights for this byte order, so the row is converted
+  // exactly once.
+  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, &kRgbaI601Constants);
 }
 void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
                               uint8_t* dst_yj,
                               int width) {
-  // No need for a separate implementation for RGBA inputs, just permute the
-  // RGB constants.
-  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width,
-                                &kRgb24JPEGDotProdConstants);
+  // RGBA reuses the generic DotProd row kernel. kRgbaJPEGConstants is expected
+  // to carry the per-byte weights for this byte order, so the row is converted
+  // exactly once.
+  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
 }
 void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
                              uint8_t* dst_y,
                              int width) {
-  // No need for a separate implementation for RGBA inputs, just permute the
-  // RGB constants.
-  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width,
-                                &kRawI601DotProdConstants);
+  // BGRA reuses the generic DotProd row kernel. kBgraI601Constants is expected
+  // to carry the per-byte weights for this byte order, so the row is converted
+  // exactly once.
+  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, &kBgraI601Constants);
 }
 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -3978,30 +3712,32 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                                 int width,
                                 const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         d0, [%3]                      \n"  // load rgbconstants
+      "ldr         s16, [%3]                     \n"  // load 4 coeffs
-      "dup         v5.16b, v0.b[0]               \n"
+      "ldr         s17, [%3, #48]                \n"  // load kAddY[0]
-      "dup         v6.16b, v0.b[1]               \n"
+      "dup         v18.16b, v16.b[0]             \n"  // B
-      "dup         v7.16b, v0.b[2]               \n"
+      "dup         v19.16b, v16.b[1]             \n"  // G
-      "dup         v16.8h,  v0.h[2]              \n"
+      "dup         v20.16b, v16.b[2]             \n"  // R
      "dup         v21.8h,  v17.h[0]             \n"  // bias
      "1:          \n"
      "ld3         {v2.16b,v3.16b,v4.16b}, [%0], #48 \n"  // load 16 pixels.
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
-      "umull       v0.8h, v2.8b, v5.8b           \n"  // B
+      "umull       v0.8h, v2.8b, v18.8b          \n"  // B
-      "umull2      v1.8h, v2.16b, v5.16b         \n"
+      "umull2      v1.8h, v2.16b, v18.16b        \n"
      "prfm        pldl1keep, [%0, 448]          \n"
-      "umlal       v0.8h, v3.8b, v6.8b           \n"  // G
+      "umlal       v0.8h, v3.8b, v19.8b          \n"  // G
-      "umlal2      v1.8h, v3.16b, v6.16b         \n"
+      "umlal2      v1.8h, v3.16b, v19.16b        \n"
-      "umlal       v0.8h, v4.8b, v7.8b           \n"  // R
+      "umlal       v0.8h, v4.8b, v20.8b          \n"  // R
-      "umlal2      v1.8h, v4.16b, v7.16b         \n"
+      "umlal2      v1.8h, v4.16b, v20.16b        \n"
-      "addhn       v0.8b, v0.8h, v16.8h          \n"  // 16 bit to 8 bit Y
+      "addhn       v0.8b, v0.8h, v21.8h          \n"  // 16 bit to 8 bit Y
-      "addhn       v1.8b, v1.8h, v16.8h          \n"
+      "addhn       v1.8b, v1.8h, v21.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
      : "+r"(src_rgb),     // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      : "r"(c)  // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
        "v19", "v20", "v21");
 }