ABGRToJ420 calls ARGBToI420Matrix

- Standardized libyuv ARGB-family (ARGB, ABGR, RGBA, BGRA) to YUV conversion by utilizing the generic MatrixRow architecture and explicit ArgbConstants.
- Consolidated ARGBToI420, ABGRToI420, BGRAToI420, and RGBAToI420 as wrappers for ARGBToI420Matrix.
- Refactored ABGRToJ420, ABGRToJ422, and ABGRToI422 to use generic matrix functions.
- Added matrix-based versions for NV21, I400, YUY2, and UYVY.
- Updated RAW and RGB24 to I420/I422/I444 dispatchers to use MatrixRow logic and explicit constants.
- Fixed parameter swap bugs in ARGBToI422, ARGBToJ422, and ABGRToJ422.
- Fixed a bug in the generic C implementation of the matrix row functions so that all 4 channels are processed correctly for all ARGB-family formats.
- Moved kShuffleAARRGGBB in row_gcc.cc to the top of the libyuv namespace for visibility.
- Cleaned up redundant format-specific row implementations.

Bug: libyuv:42280902
Change-Id: I67ffa4c476abc0d2dcc4650510d7bda91b65988e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7830291
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
2026-07-30 16:26:19 +08:00 · 2026-05-07 19:58:19 -07:00 · 2026-05-07 19:58:19 -07:00 · 4b4e68b372
commit 4b4e68b372
parent 4aacbbdfb4
10 changed files with 2858 additions and 3678 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1936
+Version: 1937
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@ -875,6 +875,19 @@ int BGRAToI420(const uint8_t* src_bgra,
               int width,
               int height);

+// BGRA little endian (argb in memory) to I422.
+LIBYUV_API
+int BGRAToI422(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
 // ABGR little endian (rgba in memory) to I420.
 LIBYUV_API
 int ABGRToI420(const uint8_t* src_abgr,
@ -888,6 +901,19 @@ int ABGRToI420(const uint8_t* src_abgr,
               int width,
               int height);

+// ABGR little endian (rgba in memory) to I422.
+LIBYUV_API
+int ABGRToI422(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
 int RGBAToI420(const uint8_t* src_rgba,
@ -901,6 +927,19 @@ int RGBAToI420(const uint8_t* src_rgba,
               int width,
               int height);

+// RGBA little endian (abgr in memory) to I422.
+LIBYUV_API
+int RGBAToI422(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
 // RGB little endian (bgr in memory) to I420.
 LIBYUV_API
 int RGB24ToI420(const uint8_t* src_rgb24,
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@ -245,6 +245,19 @@ int ARGBToI422(const uint8_t* src_argb,
               int width,
               int height);

+// Convert ABGR To I422.
+LIBYUV_API
+int ABGRToI422(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
 // ARGB to I422 with matrix. See ArgbConstants at the top of this file for usage.
 LIBYUV_API
 int ARGBToI422Matrix(const uint8_t* src_argb,
@ -458,7 +471,7 @@ int ARGBToUYVY(const uint8_t* src_argb,

 // RAW to NV21 with Matrix
 LIBYUV_API
-int RGBToNV21Matrix(const uint8_t* src_raw,
+int RAWToNV21Matrix(const uint8_t* src_raw,
                    int src_stride_raw,
                    uint8_t* dst_y,
                    int dst_stride_y,
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1936
+#define LIBYUV_VERSION 1937

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/convert.cc
+++ b/source/convert.cc
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 #include "libyuv/planar_functions.h"

 #include <assert.h>
@ -15,12 +16,10 @@

 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
-#include "libyuv/convert_from_argb.h"
 #include "libyuv/scale_row.h"  // for ScaleRowDown2

 #ifdef __cplusplus
 namespace libyuv {
-
 extern "C" {
 #endif

@ -4745,8 +4744,8 @@ static int ARGBSobelize(const uint8_t* src_argb,
                                         uint8_t* dst,
                                         int width)) {
  int y;
-  void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
-                           const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
+      ARGBToYJRow_C;
  void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
                    uint8_t* dst_sobely, int width) = SobelYRow_C;
  void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
@ -4763,65 +4762,57 @@ static int ARGBSobelize(const uint8_t* src_argb,
    src_stride_argb = -src_stride_argb;
  }

-#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
+#if defined(HAS_ARGBTOYJROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
+    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
+      ARGBToYJRow = ARGBToYJRow_AVX2;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
+#if defined(HAS_ARGBTOYROW_AVX512BW)
  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
+    ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
    if (IS_ALIGNED(width, 64)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
+      ARGBToYJRow = ARGBToYJRow_AVX512BW;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+#if defined(HAS_ARGBTOYJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+      ARGBToYJRow = ARGBToYJRow_NEON;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
-  if (TestCpuFlag(kCpuHasNeonDotProd)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOYMATRIXROW_LSX)
+#if defined(HAS_ARGBTOYJROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
+    ARGBToYJRow = ARGBToYJRow_Any_LSX;
    if (IS_ALIGNED(width, 16)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
+      ARGBToYJRow = ARGBToYJRow_LSX;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_LASX)
+#if defined(HAS_ARGBTOYJROW_LASX)
  if (TestCpuFlag(kCpuHasLASX)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
+    ARGBToYJRow = ARGBToYJRow_Any_LASX;
    if (IS_ALIGNED(width, 32)) {
-      ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
+      ARGBToYJRow = ARGBToYJRow_LASX;
    }
  }
 #endif
-#if defined(HAS_ARGBTOYMATRIXROW_RVV)
+#if defined(HAS_ARGBTOYJROW_RVV)
  if (TestCpuFlag(kCpuHasRVV)) {
-    ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
+    ARGBToYJRow = ARGBToYJRow_RVV;
  }
 #endif

@ -4859,10 +4850,10 @@ static int ARGBSobelize(const uint8_t* src_argb,
    uint8_t* row_y2 = row_y1 + row_size;
    if (!rows)
      return 1;
-    ARGBToYMatrixRow(src_argb, row_y0, width, &kArgbJPEGConstants);
+    ARGBToYJRow(src_argb, row_y0, width);
    row_y0[-1] = row_y0[0];
    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
-    ARGBToYMatrixRow(src_argb, row_y1, width, &kArgbJPEGConstants);
+    ARGBToYJRow(src_argb, row_y1, width);
    row_y1[-1] = row_y1[0];
    memset(row_y1 + width, row_y1[width - 1], 16);
    memset(row_y2 + width, 0, 16);
@ -4872,7 +4863,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
      if (y < (height - 1)) {
        src_argb += src_stride_argb;
      }
-      ARGBToYMatrixRow(src_argb, row_y2, width, &kArgbJPEGConstants);
+      ARGBToYJRow(src_argb, row_y2, width);
      row_y2[-1] = row_y2[0];
      row_y2[width] = row_y2[width - 1];

--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -753,28 +753,31 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
 MAKEROWYJ(RGBA, 3, 2, 1, 4)
 #undef MAKEROWYJ

-static __inline uint8_t RGBToYMatrix(uint8_t r,
-                                     uint8_t g,
-                                     uint8_t b,
+static __inline uint8_t RGBToYMatrix(uint8_t b0,
+                                     uint8_t b1,
+                                     uint8_t b2,
+                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
-  return (c->kRGBToY[2] * r + c->kRGBToY[1] * g + c->kRGBToY[0] * b +
-          c->kAddY[0]) >>
+  return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
+          c->kRGBToY[3] * b3 + c->kAddY[0]) >>
         8;
 }
-static __inline uint8_t RGBToUMatrix(uint8_t r,
-                                     uint8_t g,
-                                     uint8_t b,
+static __inline uint8_t RGBToUMatrix(uint8_t b0,
+                                     uint8_t b1,
+                                     uint8_t b2,
+                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
-  return (c->kAddUV[0] -
-          (c->kRGBToU[2] * r + c->kRGBToU[1] * g + c->kRGBToU[0] * b)) >>
+  return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
+                         c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
         8;
 }
-static __inline uint8_t RGBToVMatrix(uint8_t r,
-                                     uint8_t g,
-                                     uint8_t b,
+static __inline uint8_t RGBToVMatrix(uint8_t b0,
+                                     uint8_t b1,
+                                     uint8_t b2,
+                                     uint8_t b3,
                                     const struct ArgbConstants* c) {
-  return (c->kAddUV[0] -
-          (c->kRGBToV[2] * r + c->kRGBToV[1] * g + c->kRGBToV[0] * b)) >>
+  return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
+                         c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
         8;
 }

@ -784,7 +787,7 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
                        const struct ArgbConstants* c) {
  int x;
  for (x = 0; x < width; ++x) {
-    dst_y[0] = RGBToYMatrix(src_argb[2], src_argb[1], src_argb[0], c);
+    dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
    src_argb += 4;
    dst_y += 1;
  }
@ -799,25 +802,28 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
  const uint8_t* src_argb1 = src_argb + src_stride_argb;
  int x;
  for (x = 0; x < width - 1; x += 2) {
-    uint8_t ab =
+    uint8_t b0 =
        (src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
-    uint8_t ag =
+    uint8_t b1 =
        (src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
-    uint8_t ar =
+    uint8_t b2 =
        (src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
-    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
-    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
+    uint8_t b3 =
+        (src_argb[3] + src_argb[7] + src_argb1[3] + src_argb1[7] + 2) >> 2;
+    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
+    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
    src_argb += 8;
    src_argb1 += 8;
    dst_u += 1;
    dst_v += 1;
  }
  if (width & 1) {
-    uint8_t ab = (src_argb[0] + src_argb1[0] + 1) >> 1;
-    uint8_t ag = (src_argb[1] + src_argb1[1] + 1) >> 1;
-    uint8_t ar = (src_argb[2] + src_argb1[2] + 1) >> 1;
-    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
-    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
+    uint8_t b0 = (src_argb[0] + src_argb1[0] + 1) >> 1;
+    uint8_t b1 = (src_argb[1] + src_argb1[1] + 1) >> 1;
+    uint8_t b2 = (src_argb[2] + src_argb1[2] + 1) >> 1;
+    uint8_t b3 = (src_argb[3] + src_argb1[3] + 1) >> 1;
+    dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
+    dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
  }
 }

@ -828,11 +834,10 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
                            const struct ArgbConstants* c) {
  int x;
  for (x = 0; x < width; ++x) {
-    uint8_t ab = src_argb[0];
-    uint8_t ag = src_argb[1];
-    uint8_t ar = src_argb[2];
-    dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
-    dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
+    dst_u[0] =
+        RGBToUMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
+    dst_v[0] =
+        RGBToVMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
    src_argb += 4;
    dst_u += 1;
    dst_v += 1;
@ -1513,16 +1518,16 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);

 #define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV)   \
-  const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) =     \
      ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV),   \
                        -(RV), 0, AY, AUV);                                    \
-  const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) =     \
      ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV),   \
                        -(BV), 0, AY, AUV);                                    \
-  const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) =     \
      ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV),       \
                        -(GV), -(RV), AY, AUV);                                \
-  const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =            \
+  extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) =     \
      ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV),       \
                        -(GV), -(BV), AY, AUV);

--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -1848,32 +1848,41 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                               int width,
                               const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d16}, [%4]                   \n"  // load kRGBToU
-      "vld1.8      {d17}, [%5]                   \n"  // load kRGBToV
-      "vld1.16     {d18[0]}, [%6]                \n"  // load kAddUV[0]
-      "vabs.s8     d16, d16                      \n"  // BU, GU, RU
-      "vabs.s8     d17, d17                      \n"  // BV, GV, RV
-      "vdup.8      d20, d16[0]                   \n"  // BU
-      "vdup.8      d21, d16[1]                   \n"  // GU
-      "vdup.8      d22, d16[2]                   \n"  // RU
-      "vdup.8      d23, d17[0]                   \n"  // BV
-      "vdup.8      d24, d17[1]                   \n"  // GV
-      "vdup.8      d25, d17[2]                   \n"  // RV
-      "vdup.16     q15, d18[0]                   \n"  // kAddUV
-
+      "vld1.8      {d24}, [%4]                   \n"  // load kRGBToU
+      "vld1.8      {d25}, [%5]                   \n"  // load kRGBToV
+      "vld1.16     {d26[0]}, [%6]                \n"  // load kAddUV[0]
+      "vmovl.s8    q10, d24                      \n"  // U coeffs (8 shorts)
+      "vmovl.s8    q11, d25                      \n"  // V coeffs (8 shorts)
+      "vdup.16     q6, d26[0]                    \n"  // bias
      "1:          \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
-      "vmull.u8    q2, d0, d20                   \n"  // B * BU
-      "vmlsl.u8    q2, d1, d21                   \n"  // - G * GU
-      "vmlsl.u8    q2, d2, d22                   \n"  // - R * RU

-      "vmull.u8    q3, d2, d25                   \n"  // R * RV
-      "vmlsl.u8    q3, d1, d24                   \n"  // - G * GV
-      "vmlsl.u8    q3, d0, d23                   \n"  // - B * BV
+      "vmovl.u8    q4, d0                        \n"  // B
+      "vmovl.u8    q5, d1                        \n"  // G
+      "vmovl.u8    q7, d2                        \n"  // R
+      "vmovl.u8    q8, d3                        \n"  // A

-      "vaddhn.u16  d0, q2, q15                   \n"  // signed -> unsigned
-      "vaddhn.u16  d1, q3, q15                   \n"
+      "vdup.16     q12, d20[0]                   \n"
+      "vmul.s16    q2, q4, q12                   \n"  // U = B * U0
+      "vdup.16     q12, d20[1]                   \n"
+      "vmla.s16    q2, q5, q12                   \n"  // U += G * U1
+      "vdup.16     q12, d20[2]                   \n"
+      "vmla.s16    q2, q7, q12                   \n"  // U += R * U2
+      "vdup.16     q12, d20[3]                   \n"
+      "vmla.s16    q2, q8, q12                   \n"  // U += A * U3
+
+      "vdup.16     q12, d22[0]                   \n"
+      "vmul.s16    q3, q4, q12                   \n"  // V = B * V0
+      "vdup.16     q12, d22[1]                   \n"
+      "vmla.s16    q3, q5, q12                   \n"  // V += G * V1
+      "vdup.16     q12, d22[2]                   \n"
+      "vmla.s16    q3, q7, q12                   \n"  // V += R * V2
+      "vdup.16     q12, d22[3]                   \n"
+      "vmla.s16    q3, q8, q12                   \n"  // V += A * V3
+
+      "vsubhn.s16  d0, q6, q2                    \n"  // 128.0 - U
+      "vsubhn.s16  d1, q6, q3                    \n"  // 128.0 - V

      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
@ -1885,8 +1894,8 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
      : "r"(&c->kRGBToU),   // %4
        "r"(&c->kRGBToV),   // %5
        "r"(&c->kAddUV)     // %6
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
-        "q12", "q13", "q14", "q15");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q10", "q11", "q12");
 }

 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
@ -1926,16 +1935,11 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                            const struct ArgbConstants* c) {
  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
-      "vld1.8      {d18}, [%5]                   \n"  // load kRGBToU
-      "vld1.8      {d19}, [%6]                   \n"  // load kRGBToV
-      "vmovl.s8    q8, d18                       \n"  // U coeffs in q8 (d16, d17)
-      "vmovl.s8    q9, d19                       \n"  // V coeffs in q9 (d18, d19)
-      "vdup.16     q10, d16[0]                   \n"  // U0
-      "vdup.16     q11, d16[1]                   \n"  // U1
-      "vdup.16     q12, d16[2]                   \n"  // U2
-      "vdup.16     q13, d18[0]                   \n"  // V0
-      "vdup.16     q14, d18[1]                   \n"  // V1
-      "vdup.16     q15, d18[2]                   \n"  // V2
+      "vld1.8      {d24}, [%5]                   \n"  // load kRGBToU (8 bytes, only 4 used)
+      "vld1.8      {d25}, [%6]                   \n"  // load kRGBToV
+      "vmovl.s8    q14, d24                      \n"  // U coeffs in d28
+      "vmovl.s8    q15, d25                      \n"  // V coeffs in d30
+      "vmov.u16    q11, #0x8000                  \n"  // 128.0 bias

      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
@ -1944,28 +1948,39 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "vpaddl.u8   q0, q0                        \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8   q2, q2                        \n"  // R 16 bytes -> 8 shorts.
-      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ARGB pixels.
-      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ARGB pixels.
-      "vpadal.u8   q0, q4                        \n"  // B 16 bytes -> 8 shorts.
-      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
-      "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
+      "vpaddl.u8   q3, q3                        \n"  // A 16 bytes -> 8 shorts.
+      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more pixels.
+      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"
+      "vpadal.u8   q0, q4                        \n"  // B
+      "vpadal.u8   q1, q5                        \n"  // G
+      "vpadal.u8   q2, q6                        \n"  // R
+      "vpadal.u8   q3, q7                        \n"  // A

      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
      "vrshr.u16   q1, q1, #2                    \n"
      "vrshr.u16   q2, q2, #2                    \n"
+      "vrshr.u16   q3, q3, #2                    \n"

-      "vmov.u16    q3, #0x8000                   \n"  // 128.0
-
-      "vmul.s16    q8, q0, q10                   \n"  // U = B * U0
-      "vmla.s16    q8, q1, q11                   \n"  // U += G * U1
+      "vdup.16     q12, d28[0]                   \n"
+      "vmul.s16    q8, q0, q12                   \n"  // U = B * U0
+      "vdup.16     q12, d28[1]                   \n"
+      "vmla.s16    q8, q1, q12                   \n"  // U += G * U1
+      "vdup.16     q12, d28[2]                   \n"
      "vmla.s16    q8, q2, q12                   \n"  // U += R * U2
+      "vdup.16     q12, d28[3]                   \n"
+      "vmla.s16    q8, q3, q12                   \n"  // U += A * U3

-      "vmul.s16    q9, q0, q13                   \n"  // V = B * V0
-      "vmla.s16    q9, q1, q14                   \n"  // V += G * V1
-      "vmla.s16    q9, q2, q15                   \n"  // V += R * V2
+      "vdup.16     q12, d30[0]                   \n"
+      "vmul.s16    q9, q0, q12                   \n"  // V = B * V0
+      "vdup.16     q12, d30[1]                   \n"
+      "vmla.s16    q9, q1, q12                   \n"  // V += G * V1
+      "vdup.16     q12, d30[2]                   \n"
+      "vmla.s16    q9, q2, q12                   \n"  // V += R * V2
+      "vdup.16     q12, d30[3]                   \n"
+      "vmla.s16    q9, q3, q12                   \n"  // V += A * V3

-      "vsubhn.s16  d0, q3, q8                    \n"  // 128.0 - U
-      "vsubhn.s16  d1, q3, q9                    \n"  // 128.0 - V
+      "vsubhn.s16  d0, q11, q8                   \n"  // 128.0 - U
+      "vsubhn.s16  d1, q11, q9                   \n"  // 128.0 - V

      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
@ -1978,7 +1993,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
  : "r"(&c->kRGBToU),  // %5
    "r"(&c->kRGBToV)   // %6
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+    "q8", "q9", "q11", "q12", "q14", "q15"
  );
 }

@ -2212,44 +2227,8 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile (
-      "add         %1, %0, %1                    \n"  // src_stride + src_bgra
-      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
-      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
-      "vmov.u16    q15, #0x8000                  \n"  // 128.0
-      "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 BGRA pixels.
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 BGRA pixels.
-      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
-      "vpaddl.u8   q3, q3                        \n"  // B 16 bytes -> 8 shorts.
-      "vpaddl.u8   q2, q2                        \n"  // G 16 bytes -> 8 shorts.
-      "vpaddl.u8   q1, q1                        \n"  // R 16 bytes -> 8 shorts.
-      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more BGRA pixels.
-      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 BGRA pixels.
-      "vpadal.u8   q3, q7                        \n"  // B 16 bytes -> 8 shorts.
-      "vpadal.u8   q2, q6                        \n"  // G 16 bytes -> 8 shorts.
-      "vpadal.u8   q1, q5                        \n"  // R 16 bytes -> 8 shorts.
-
-      "vrshr.u16   q1, q1, #2                    \n"  // average of 4
-      "vrshr.u16   q2, q2, #2                    \n"
-      "vrshr.u16   q3, q3, #2                    \n"
-
-    RGBTOUV(q3, q2, q1)
-      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
-      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
-      "bgt         1b                            \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(src_stride_bgra),  // %1
-    "+r"(dst_u),     // %2-
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+                         &kBgraI601Constants);
 }

 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
@ -2257,44 +2236,8 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile (
-      "add         %1, %0, %1                    \n"  // src_stride + src_abgr
-      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
-      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
-      "vmov.u16    q15, #0x8000                  \n"  // 128.0
-      "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ABGR pixels.
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ABGR pixels.
-      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
-      "vpaddl.u8   q2, q2                        \n"  // B 16 bytes -> 8 shorts.
-      "vpaddl.u8   q1, q1                        \n"  // G 16 bytes -> 8 shorts.
-      "vpaddl.u8   q0, q0                        \n"  // R 16 bytes -> 8 shorts.
-      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more ABGR pixels.
-      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 ABGR pixels.
-      "vpadal.u8   q2, q6                        \n"  // B 16 bytes -> 8 shorts.
-      "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
-      "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
-
-      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
-      "vrshr.u16   q1, q1, #2                    \n"
-      "vrshr.u16   q2, q2, #2                    \n"
-
-    RGBTOUV(q2, q1, q0)
-      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
-      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
-      "bgt         1b                            \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(src_stride_abgr),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                         &kAbgrI601Constants);
 }

 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
@ -2302,44 +2245,8 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile (
-      "add         %1, %0, %1                    \n"  // src_stride + src_rgba
-      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
-      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
-      "vmov.u16    q15, #0x8000                  \n"  // 128.0
-      "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 RGBA pixels.
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 RGBA pixels.
-      "subs        %4, %4, #16                   \n"  // 16 processed per loop.
-      "vpaddl.u8   q0, q1                        \n"  // B 16 bytes -> 8 shorts.
-      "vpaddl.u8   q1, q2                        \n"  // G 16 bytes -> 8 shorts.
-      "vpaddl.u8   q2, q3                        \n"  // R 16 bytes -> 8 shorts.
-      "vld4.8      {d8, d10, d12, d14}, [%1]!    \n"  // load 8 more RGBA pixels.
-      "vld4.8      {d9, d11, d13, d15}, [%1]!    \n"  // load last 8 RGBA pixels.
-      "vpadal.u8   q0, q5                        \n"  // B 16 bytes -> 8 shorts.
-      "vpadal.u8   q1, q6                        \n"  // G 16 bytes -> 8 shorts.
-      "vpadal.u8   q2, q7                        \n"  // R 16 bytes -> 8 shorts.
-
-      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
-      "vrshr.u16   q1, q1, #2                    \n"
-      "vrshr.u16   q2, q2, #2                    \n"
-
-    RGBTOUV(q0, q1, q2)
-      "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
-      "vst1.8      {d1}, [%3]!                   \n"  // store 8 pixels V.
-      "bgt         1b                            \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(src_stride_rgba),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+                         &kRgbaI601Constants);
 }

 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
@ -2801,15 +2708,16 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                            int width,
                            const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d16[0]                   \n"  // BY
-      "vdup.8      d21, d16[1]                   \n"  // GY
-      "vdup.8      d22, d16[2]                   \n"  // RY
-      "vdup.16     q12, d18[0]                   \n"  // AY
+      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d24[0]                   \n"  // B
+      "vdup.8      d21, d24[1]                   \n"  // G
+      "vdup.8      d22, d24[2]                   \n"  // R
+      "vdup.8      d23, d24[3]                   \n"  // A
+      "vdup.16     q12, d25[0]                   \n"  // bias
      "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of ARGB
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
+      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of ARGB
+      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 pixels
      "subs        %1, %1, #16                   \n"  // 16 processed per loop.
      "vmull.u8    q8, d0, d20                   \n"  // B
      "vmull.u8    q9, d1, d20                   \n"
@ -2817,6 +2725,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
      "vmlal.u8    q9, d3, d21                   \n"
      "vmlal.u8    q8, d4, d22                   \n"  // R
      "vmlal.u8    q9, d5, d22                   \n"
+      "vmlal.u8    q8, d6, d23                   \n"  // A
+      "vmlal.u8    q9, d7, d23                   \n"
      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
      "vaddhn.u16  d1, q9, q12                   \n"
      "vst1.8      {d0, d1}, [%2]!               \n"  // store 16 pixels Y.
@ -2826,8 +2736,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
        "+r"(dst_y)        // %2
      : "r"(&c->kRGBToY),  // %3
        "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
-        "q12");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
+        "d24", "d25");
 }

 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -2846,52 +2756,20 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
 }

-// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
-// Same code as ARGB, except the LD4
-static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
-                                  uint8_t* dst_y,
-                                  int width,
-                                  const struct ArgbConstants* c) {
-  asm volatile(
-      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d16[0]                   \n"  // BY
-      "vdup.8      d21, d16[1]                   \n"  // GY
-      "vdup.8      d22, d16[2]                   \n"  // RY
-      "vdup.16     q12, d18[0]                   \n"  // AY
-      "1:          \n"
-      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of RGBA
-      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
-      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
-      "vmull.u8    q8, d2, d20                   \n"  // B
-      "vmull.u8    q9, d3, d20                   \n"
-      "vmlal.u8    q8, d4, d21                   \n"  // G
-      "vmlal.u8    q9, d5, d21                   \n"
-      "vmlal.u8    q8, d6, d22                   \n"  // R
-      "vmlal.u8    q9, d7, d22                   \n"
-      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
-      "vaddhn.u16  d1, q9, q12                   \n"
-      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
-      "bgt         1b                            \n"
-      : "+r"(src_rgba),    // %0
-        "+r"(dst_y),       // %1
-        "+r"(width)        // %2
-      : "r"(&c->kRGBToY),  // %3
-        "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
-        "q12");
-}
-
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
 }

 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
 }

 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
+  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
+}
+
+void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
+  ARGBToYMatrixRow_NEON(src_bgra, dst_yj, width, &kBgraJPEGConstants);
 }

 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -2899,12 +2777,12 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                                 int width,
                                 const struct ArgbConstants* c) {
  asm volatile(
-      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
-      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
-      "vdup.8      d20, d16[0]                   \n"  // BY
-      "vdup.8      d21, d16[1]                   \n"  // GY
-      "vdup.8      d22, d16[2]                   \n"  // RY
-      "vdup.16     q12, d18[0]                   \n"  // AY
+      "vld1.8      {d24}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d25[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d24[0]                   \n"  // BY
+      "vdup.8      d21, d24[1]                   \n"  // GY
+      "vdup.8      d22, d24[2]                   \n"  // RY
+      "vdup.16     q12, d25[0]                   \n"  // kAddY bias
      "1:          \n"
      "vld3.8      {d2, d4, d6}, [%0]!           \n"  // load 16 pixels of
                                                      // RGB24.
@ -2925,8 +2803,8 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
        "+r"(width)        // %2
      : "r"(&c->kRGBToY),  // %3
        "r"(&c->kAddY)     // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
-        "q12");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
+        "d24", "d25");
 }


--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2736,47 +2736,61 @@ struct RgbUVConstants {
 };

 // 8x1 pixels.
-static void ARGBToUV444MatrixRow_NEON(
-    const uint8_t* src_argb,
-    uint8_t* dst_u,
-    uint8_t* dst_v,
-    int width,
-    const struct RgbUVConstants* rgbuvconstants) {
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         d0, [%4]                      \n"  // load rgbuvconstants
-      "dup         v24.16b, v0.b[0]              \n"  // UB  0.875 coefficient
-      "dup         v25.16b, v0.b[1]              \n"  // UG -0.5781 coefficient
-      "dup         v26.16b, v0.b[2]              \n"  // UR -0.2969 coefficient
-      "dup         v27.16b, v0.b[4]              \n"  // VB -0.1406 coefficient
-      "dup         v28.16b, v0.b[5]              \n"  // VG -0.7344 coefficient
-      "neg         v24.16b, v24.16b              \n"
-      "movi        v29.8h, #0x80, lsl #8         \n"  // 128.0
-
+      "ldr        q16, [%[c], #16]               \n" // kRGBToU
+      "ldr        q17, [%[c], #32]               \n" // kRGBToV
+      "ldr        s0, [%[c], #64]                \n" // kAddUV
+      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
+      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
+      "dup        v20.8h, v16.h[0]               \n" // U0
+      "dup        v21.8h, v16.h[1]               \n" // U1
+      "dup        v22.8h, v16.h[2]               \n" // U2
+      "dup        v23.8h, v16.h[3]               \n" // U3
+      "dup        v24.8h, v17.h[0]               \n" // V0
+      "dup        v26.8h, v17.h[1]               \n" // V1
+      "dup        v27.8h, v17.h[2]               \n" // V2
+      "dup        v28.8h, v17.h[3]               \n" // V3
+      "dup        v25.8h, v0.h[0]                \n" // kAddUV
      "1:          \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
-      "umull       v4.8h, v0.8b, v24.8b          \n"  // B
-      "umlsl       v4.8h, v1.8b, v25.8b          \n"  // G
-      "umlsl       v4.8h, v2.8b, v26.8b          \n"  // R
-      "prfm        pldl1keep, [%0, 448]          \n"

-      "umull       v3.8h, v2.8b, v24.8b          \n"  // R
-      "umlsl       v3.8h, v1.8b, v28.8b          \n"  // G
-      "umlsl       v3.8h, v0.8b, v27.8b          \n"  // B
+      "uxtl        v4.8h, v0.8b                  \n"
+      "uxtl        v5.8h, v1.8b                  \n"
+      "uxtl        v6.8h, v2.8b                  \n"
+      "uxtl        v7.8h, v3.8b                  \n"

-      "addhn       v0.8b, v4.8h, v29.8h          \n"  // signed -> unsigned
-      "addhn       v1.8b, v3.8h, v29.8h          \n"
+      // U = B*U0 + G*U1 + R*U2 + A*U3
+      "mul         v18.8h, v4.8h, v20.8h         \n"
+      "mla         v18.8h, v5.8h, v21.8h         \n"
+      "mla         v18.8h, v6.8h, v22.8h         \n"
+      "mla         v18.8h, v7.8h, v23.8h         \n"

-      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
+      // V = B*V0 + G*V1 + R*V2 + A*V3
+      "mul         v19.8h, v4.8h, v24.8h         \n"
+      "mla         v19.8h, v5.8h, v26.8h         \n"
+      "mla         v19.8h, v6.8h, v27.8h         \n"
+      "mla         v19.8h, v7.8h, v28.8h         \n"
+
+      "subhn       v0.8b, v25.8h, v18.8h         \n"
+      "subhn       v1.8b, v25.8h, v19.8h         \n"
+
+      "st1         {v0.8b}, [%1], #8             \n"
+      "st1         {v1.8b}, [%2], #8             \n"
      "b.gt        1b                            \n"
-      : "+r"(src_argb),      // %0
-        "+r"(dst_u),         // %1
-        "+r"(dst_v),         // %2
-        "+r"(width)          // %3
-      : "r"(rgbuvconstants)  // %4
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
-        "v27", "v28", "v29");
+      : "+r"(src_argb),     // %0
+        "+r"(dst_u),        // %1
+        "+r"(dst_v),        // %2
+        "+r"(width)         // %3
+      : [c] "r"(c)          // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+        "v26", "v27", "v28");
 }

 static void ARGBToUV444MatrixRow_NEON_I8MM(
@ -2784,10 +2798,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
    uint8_t* dst_u,
    uint8_t* dst_v,
    int width,
-    const struct RgbUVConstants* rgbuvconstants) {
+    const struct ArgbConstants* c) {
  asm volatile(
-      "ld2r        {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
-      "movi        v29.8h, #0x80, lsl #8         \n"  // 128.0
+      "ldr         q16, [%[c], #16]              \n" // kRGBToU
+      "ldr         q17, [%[c], #32]              \n" // kRGBToV
+      "ldr         s0, [%[c], #64]               \n" // kAddUV
+      "dup         v29.8h, v0.h[0]               \n" // kAddUV bias
      "1:          \n"
      "ldp         q0, q1, [%[src]], #32         \n"
      "subs        %w[width], %w[width], #8      \n"  // 8 processed per loop.
@ -2807,11 +2823,11 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
      "str         d0, [%[dst_u]], #8            \n"  // store 8 pixels U.
      "str         d1, [%[dst_v]], #8            \n"  // store 8 pixels V.
      "b.gt        1b                            \n"
-      : [src] "+r"(src_argb),                 // %[src]
-        [dst_u] "+r"(dst_u),                  // %[dst_u]
-        [dst_v] "+r"(dst_v),                  // %[dst_v]
-        [width] "+r"(width)                   // %[width]
-      : [rgbuvconstants] "r"(rgbuvconstants)  // %[rgbuvconstants]
+      : [src] "+r"(src_argb),     // %[src]
+        [dst_u] "+r"(dst_u),      // %[dst_u]
+        [dst_v] "+r"(dst_v),      // %[dst_v]
+        [width] "+r"(width)       // %[width]
+      : [c] "r"(c)  // %[c]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
        "v29");
 }
@ -2824,15 +2840,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
 // VG -0.7344 coefficient = -94
 // VR   0.875 coefficient = 112

-static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
-                                                           {18, 94, -112, 0}};
-
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kARGBI601UVConstants);
+                            &kArgbI601Constants);
 }

 void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2840,26 +2853,15 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
                              uint8_t* dst_v,
                              int width) {
  ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
-                                 &kARGBI601UVConstants);
+                                 &kArgbI601Constants);
 }

-// RGB to JPEG coefficients
-// UB  0.500    coefficient = 128
-// UG -0.33126  coefficient = -85
-// UR -0.16874  coefficient = -43
-// VB -0.08131  coefficient = -21
-// VG -0.41869  coefficient = -107
-// VR 0.500     coefficient = 128
-
-static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
-                                                           {21, 107, -128, 0}};
-
 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kARGBJPEGUVConstants);
+                            &kArgbJPEGConstants);
 }

 void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2867,7 +2869,7 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
                               uint8_t* dst_v,
                               int width) {
  ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
-                                 &kARGBJPEGUVConstants);
+                                 &kArgbJPEGConstants);
 }

 #define RGBTOUV_SETUP_REG                                                  \
@ -2906,12 +2908,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "ldr        q17, [%[c], #32]               \n" // kRGBToV
      "sxtl       v16.8h, v16.8b                 \n" // sign extend U coeffs to 16-bit
      "sxtl       v17.8h, v17.8b                 \n" // sign extend V coeffs to 16-bit
-      "dup        v20.8h, v16.h[0]               \n" // U0 (-BU)
-      "dup        v21.8h, v16.h[1]               \n" // U1 (-GU)
-      "dup        v22.8h, v16.h[2]               \n" // U2 (-RU)
-      "dup        v23.8h, v17.h[0]               \n" // V0 (-BV)
-      "dup        v24.8h, v17.h[1]               \n" // V1 (-GV)
-      "dup        v26.8h, v17.h[2]               \n" // V2 (-RV)
+      "dup        v20.8h, v16.h[0]               \n" // U0
+      "dup        v21.8h, v16.h[1]               \n" // U1
+      "dup        v22.8h, v16.h[2]               \n" // U2
+      "dup        v23.8h, v16.h[3]               \n" // U3
+      "dup        v24.8h, v17.h[0]               \n" // V0
+      "dup        v26.8h, v17.h[1]               \n" // V1
+      "dup        v27.8h, v17.h[2]               \n" // V2
+      "dup        v28.8h, v17.h[3]               \n" // V3
      "movi       v25.8h, #0x80, lsl #8          \n" // 128.0 in 16-bit (0x8000)

      "1:          \n"
@ -2921,26 +2925,31 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
      "prfm        pldl1keep, [%0, 448]          \n"
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "uaddlp      v18.8h, v3.16b                \n"  // A 16 bytes -> 8 shorts.

      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "uadalp      v18.8h, v7.16b                \n"  // A 16 bytes -> 8 shorts.

      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
      "urshr       v1.8h, v1.8h, #2              \n"
      "urshr       v2.8h, v2.8h, #2              \n"
+      "urshr       v18.8h, v18.8h, #2             \n"

-      // U = B*U0 + G*U1 + R*U2
+      // U = B*U0 + G*U1 + R*U2 + A*U3
      "mul        v3.8h, v0.8h, v20.8h          \n"
      "mla        v3.8h, v1.8h, v21.8h          \n"
      "mla        v3.8h, v2.8h, v22.8h          \n"
+      "mla        v3.8h, v18.8h, v23.8h         \n"

-      // V = B*V0 + G*V1 + R*V2
-      "mul        v4.8h, v0.8h, v23.8h          \n"
-      "mla        v4.8h, v1.8h, v24.8h          \n"
-      "mla        v4.8h, v2.8h, v26.8h          \n"
+      // V = B*V0 + G*V1 + R*V2 + A*V3
+      "mul        v4.8h, v0.8h, v24.8h          \n"
+      "mla        v4.8h, v1.8h, v26.8h          \n"
+      "mla        v4.8h, v2.8h, v27.8h          \n"
+      "mla        v4.8h, v18.8h, v28.8h         \n"

      // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
      "subhn      v0.8b, v25.8h, v3.8h           \n"
@ -2956,7 +2965,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
    "+r"(width)        // %4
  : [c] "r"(c)         // %5
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
+    "v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+    "v27", "v28"
  );
 }

@ -2974,44 +2984,35 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
-  asm volatile (
-      "movi        v20.8h, #128                  \n"  // UB/VR coeff (0.500)
-      "movi        v21.8h, #85                   \n"  // UG coeff (-0.33126)
-      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
-      "movi        v23.8h, #21                   \n"  // VB coeff (-0.08131)
-      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
-      "movi        v25.8h, #0x80, lsl #8         \n"  // 128.0 (0x8000 in 16-bit)
-      "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
-      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%0, 448]          \n"
-      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
-      "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%1, 448]          \n"
-      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
+  ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
+                         &kArgbJPEGConstants);
+}

-      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
-      "urshr       v1.8h, v1.8h, #2              \n"
-      "urshr       v2.8h, v2.8h, #2              \n"
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                         &kAbgrI601Constants);
+}

-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
-      "b.gt        1b                            \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+                         &kBgraI601Constants);
+}
+
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+                         &kRgbaI601Constants);
 }

 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
@ -3019,44 +3020,8 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                       uint8_t* dst_uj,
                       uint8_t* dst_vj,
                       int width) {
-  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
-  asm volatile (
-      "movi        v20.8h, #128                  \n"  // UB/VR coeff (0.500)
-      "movi        v21.8h, #85                   \n"  // UG coeff (-0.33126)
-      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
-      "movi        v23.8h, #21                   \n"  // VB coeff (-0.08131)
-      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
-      "movi        v25.8h, #0x80, lsl #8         \n"  // 128.0 (0x8000 in 16-bit)
-      "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
-      "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%0, 448]          \n"
-      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
-      "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%1, 448]          \n"
-      "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
-
-      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
-      "urshr       v1.8h, v1.8h, #2              \n"
-      "urshr       v2.8h, v2.8h, #2              \n"
-
-    RGBTOUV(v2.8h, v1.8h, v0.8h)
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
-      "b.gt        1b                            \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(src_abgr_1),  // %1
-    "+r"(dst_uj),     // %2
-    "+r"(dst_vj),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
+  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
+                         &kAbgrJPEGConstants);
 }

 void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
@ -3149,126 +3114,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
  );
 }

-void BGRAToUVRow_NEON(const uint8_t* src_bgra,
-                      int src_stride_bgra,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-      "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
-      "uaddlp      v0.8h, v3.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%0, 448]          \n"
-      "uaddlp      v3.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uaddlp      v2.8h, v1.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
-      "uadalp      v0.8h, v7.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%1, 448]          \n"
-      "uadalp      v3.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uadalp      v2.8h, v5.16b                 \n"  // R 16 bytes -> 8 shorts.
-
-      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
-      "urshr       v1.8h, v3.8h, #2              \n"
-      "urshr       v2.8h, v2.8h, #2              \n"
-
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
-      "b.gt        1b                            \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(src_bgra_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-
-void ABGRToUVRow_NEON(const uint8_t* src_abgr,
-                      int src_stride_abgr,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-      "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
-      "uaddlp      v3.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%0, 448]          \n"
-      "uaddlp      v2.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uaddlp      v1.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
-      "uadalp      v3.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%1, 448]          \n"
-      "uadalp      v2.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uadalp      v1.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
-
-      "urshr       v0.8h, v3.8h, #2              \n"  // average of 4
-      "urshr       v2.8h, v2.8h, #2              \n"
-      "urshr       v1.8h, v1.8h, #2              \n"
-
-    RGBTOUV(v0.8h, v2.8h, v1.8h)
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
-      "b.gt        1b                            \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(src_abgr_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-
-void RGBAToUVRow_NEON(const uint8_t* src_rgba,
-                      int src_stride_rgba,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-      "1:          \n"
-      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
-      "uaddlp      v0.8h, v1.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%0, 448]          \n"
-      "uaddlp      v1.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uaddlp      v2.8h, v3.16b                 \n"  // R 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
-      "uadalp      v0.8h, v5.16b                 \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%1, 448]          \n"
-      "uadalp      v1.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
-      "uadalp      v2.8h, v7.16b                 \n"  // R 16 bytes -> 8 shorts.
-
-      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
-      "urshr       v1.8h, v1.8h, #2              \n"
-      "urshr       v2.8h, v2.8h, #2              \n"
-
-    RGBTOUV(v0.8h, v1.8h, v2.8h)
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
-      "b.gt        1b                            \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(src_rgba_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
@ -3483,18 +3328,19 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
  );
 }

-// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
+// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
 static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
                                        int src_stride,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width,
-                                        const int8_t* uvconstants) {
+                                        const struct ArgbConstants* c) {
  const uint8_t* src1 = src + src_stride;
  asm volatile(
      "movi        v23.8h, #0x80, lsl #8           \n"  // 128.0 (0x8000 in
                                                        // 16-bit)
-      "ld2r        {v24.4s, v25.4s}, [%[uvconstants]] \n"
+      "ldr         q24, [%[c], #16]                \n"  // kRGBToU: 16 raw bytes at c+16. NOTE(review): the old ld2r replicated one 4-byte group to every lane; confirm the table is already replicated.
+      "ldr         q25, [%[c], #32]                \n"  // kRGBToV: 16 raw bytes at c+32. NOTE(review): confirm element width and sign match what the loop (elided here) expects.

      "1:          \n"
      "ld2         {v0.4s, v1.4s}, [%[src]], #32   \n"  // load 8 pixels
@ -3547,51 +3393,19 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
        [dst_u] "+r"(dst_u),            // %[dst_u]
        [dst_v] "+r"(dst_v),            // %[dst_v]
        [width] "+r"(width)             // %[width]
-      : [uvconstants] "r"(uvconstants)  // %[uvconstants]
+      : [c] "r"(c)                      // %[c], read-only table pointer
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
        "v24", "v25");
 }

-// RGB to BT601 coefficients
-// UB   0.875 coefficient = 112
-// UG -0.5781 coefficient = -74
-// UR -0.2969 coefficient = -38
-// VB -0.1406 coefficient = -18
-// VG -0.7344 coefficient = -94
-// VR   0.875 coefficient = 112
-// I8MM constants are stored negated such that we can store 128 in int8_t.
-
-static const int8_t kARGBToUVCoefficients[] = {
-    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-    -112, 74, 38, 0, 18, 94, -112, 0,
-};
-
-static const int8_t kABGRToUVCoefficients[] = {
-    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
-    38, 74, -112, 0, -112, 94, 18, 0,
-};
-
-static const int8_t kBGRAToUVCoefficients[] = {
-    // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
-    0, 38, 74, -112, 0, -112, 94, 18,
-};
-
-static const int8_t kRGBAToUVCoefficients[] = {
-    // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
-    0, -112, 74, 38, 0, 18, 94, -112,
-};
-
 void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                 int src_stride_argb,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width,
                                 const struct ArgbConstants* c) {
-  int8_t uvconstants[8] = {
-      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
-      (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                                   uvconstants);
+                                   c);  // forward the caller's table as-is; the int8_t repack is gone, so the kernel reads the struct's bytes directly
 }

 void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
@ -3600,7 +3414,7 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                              kARGBToUVCoefficients);
+                              &kArgbI601Constants);  // NOTE(review): the removed int8_t table held negated coefficients; confirm this table's U/V rows suit the kernel
 }

 void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3609,7 +3423,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                              kABGRToUVCoefficients);
+                              &kAbgrI601Constants);  // NOTE(review): the removed int8_t table held negated coefficients; confirm this table's U/V rows suit the kernel
 }

 void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
@ -3618,7 +3432,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
-                              kBGRAToUVCoefficients);
+                              &kBgraI601Constants);  // NOTE(review): the removed int8_t table held negated coefficients; confirm this table's U/V rows suit the kernel
 }

 void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
@ -3627,35 +3441,16 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                           uint8_t* dst_v,
                           int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
-                              kRGBAToUVCoefficients);
+                              &kRgbaI601Constants);  // NOTE(review): the removed int8_t table held negated coefficients; confirm this table's U/V rows suit the kernel
 }

-// RGB to JPEG coefficients
-// UB  0.500    coefficient = 128
-// UG -0.33126  coefficient = -85
-// UR -0.16874  coefficient = -43
-// VB -0.08131  coefficient = -21
-// VG -0.41869  coefficient = -107
-// VR 0.500     coefficient = 128
-// I8MM constants are stored negated such that we can store 128 in int8_t.
-
-static const int8_t kARGBToUVJCoefficients[] = {
-    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-    -128, 85, 43, 0, 21, 107, -128, 0,
-};
-
-static const int8_t kABGRToUVJCoefficients[] = {
-    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
-    43, 85, -128, 0, -128, 107, 21, 0,
-};
-
 void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
-                              kARGBToUVJCoefficients);
+                              &kArgbJPEGConstants);  // NOTE(review): the removed int8_t table stored -128 for the 128 coefficient; confirm how this table encodes it
 }

 void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3664,7 +3459,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_v,
                            int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                              kABGRToUVJCoefficients);
+                              &kAbgrJPEGConstants);  // NOTE(review): the removed int8_t table stored -128 for the 128 coefficient; confirm how this table encodes it
 }

 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@ -3771,206 +3566,145 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                                  int width,
                                  const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         s0, [%3]                      \n"  // load rgbconstants
-      "ldr         s1, [%3, #48]                 \n"
-      "dup         v6.16b, v0.b[0]               \n"
-      "dup         v7.16b, v0.b[1]               \n"
-      "dup         v16.16b, v0.b[2]              \n"
-      "dup         v17.8h,  v1.h[0]              \n"
+      "ldr         s16, [%3]                     \n"  // load 4 byte coeffs from c+0, one per channel
+      "ldr         s17, [%3, #48]                \n"  // load 32 bits at c+48; low half is kAddY[0]
+      "dup         v18.16b, v16.b[0]             \n"  // coeff for byte 0 of each pixel (B in ARGB order)
+      "dup         v19.16b, v16.b[1]             \n"  // coeff for byte 1 (G)
+      "dup         v20.16b, v16.b[2]             \n"  // coeff for byte 2 (R in ARGB order)
+      "dup         v21.16b, v16.b[3]             \n"  // coeff for byte 3 (A in ARGB order)
+      "dup         v22.8h,  v17.h[0]             \n"  // bias in every 16-bit lane
      "1:          \n"
      "ld4         {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n"  // load 16
-                                                                 // pixels.
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
-      "umull       v0.8h, v2.8b, v6.8b           \n"  // B
-      "umull2      v1.8h, v2.16b, v6.16b         \n"
+      "umull       v0.8h, v2.8b, v18.8b          \n"  // B, low 8 pixels
+      "umull2      v1.8h, v2.16b, v18.16b        \n"  // B, high 8 pixels
      "prfm        pldl1keep, [%0, 448]          \n"
-      "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
-      "umlal2      v1.8h, v3.16b, v7.16b         \n"
-      "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
-      "umlal2      v1.8h, v4.16b, v16.16b        \n"
-      "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
-      "addhn       v1.8b, v1.8h, v17.8h          \n"
+      "umlal       v0.8h, v3.8b, v19.8b          \n"  // G
+      "umlal2      v1.8h, v3.16b, v19.16b        \n"
+      "umlal       v0.8h, v4.8b, v20.8b          \n"  // R
+      "umlal2      v1.8h, v4.16b, v20.16b        \n"
+      "umlal       v0.8h, v5.8b, v21.8b          \n"  // A: new 4th term; 255 * (sum of the 4 coeffs) + bias must still fit in 16 bits
+      "umlal2      v1.8h, v5.16b, v21.16b        \n"
+      "addhn       v0.8b, v0.8h, v22.8h          \n"  // add bias, keep high byte: 16 bit to 8 bit Y
+      "addhn       v1.8b, v1.8h, v22.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
-      : "r"(c)  // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v17");
+      : "r"(c)             // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
+        "v19", "v20", "v21", "v22");  // coeffs and bias now live in v16-v22; v6/v7 are no longer touched
 }

+
 void ARGBToYMatrixRow_NEON_DotProd(
    const uint8_t* src_argb,
    uint8_t* dst_y,
    int width,
    const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         s0, [%3]                      \n"  // load rgbconstants
-      "ldr         s1, [%3, #48]                 \n"
-      "dup         v16.4s, v0.s[0]               \n"
-      "dup         v17.8h,  v1.h[0]              \n"
+      "ldr         s16, [%3]                     \n"  // load 4 byte coeffs from c+0
+      "ldr         s17, [%3, #48]                \n"  // load 32 bits at c+48; low half is kAddY[0]
+      "dup         v18.4s, v16.s[0]              \n"  // same 4 coeffs in each 32-bit lane (one lane per pixel)
+      "dup         v19.8h, v17.h[0]              \n"  // bias in every 16-bit lane
      "1:          \n"
      "ld1         {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n"  // load 16
-                                                                    // pixels.
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
      "movi        v0.16b, #0                    \n"
      "movi        v1.16b, #0                    \n"
      "movi        v2.16b, #0                    \n"
      "movi        v3.16b, #0                    \n"
-      "udot        v0.4s, v4.16b, v16.16b        \n"
-      "udot        v1.4s, v5.16b, v16.16b        \n"
-      "udot        v2.4s, v6.16b, v16.16b        \n"
-      "udot        v3.4s, v7.16b, v16.16b        \n"
+      "udot        v0.4s, v4.16b, v18.16b        \n"  // per pixel: sum of the 4 byte * coeff products
+      "udot        v1.4s, v5.16b, v18.16b        \n"
+      "udot        v2.4s, v6.16b, v18.16b        \n"
+      "udot        v3.4s, v7.16b, v18.16b        \n"
      "uzp1        v0.8h, v0.8h, v1.8h           \n"
      "uzp1        v1.8h, v2.8h, v3.8h           \n"
-      "addhn       v0.8b, v0.8h, v17.8h          \n"
-      "addhn       v1.8b, v1.8h, v17.8h          \n"
+      "addhn       v0.8b, v0.8h, v19.8h          \n"  // add bias, keep high byte: 16 bit to 8 bit Y
+      "addhn       v1.8b, v1.8h, v19.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
-      : "r"(c)  // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v17");
+      : "r"(c)             // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
 }

+
 // RGB to JPeg coefficients
-// B * 0.1140 coefficient = 29
-// G * 0.5870 coefficient = 150
-// R * 0.2990 coefficient = 77
-// Add 0.5
-static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
-static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
-
-static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
-
-// RGB to BT.601 coefficients
-// B * 0.1016 coefficient = 25
-// G * 0.5078 coefficient = 129
-// R * 0.2578 coefficient = 66
-// Add 16.5 = 0x1080
-
-static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
-static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
-
-static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
-static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};

 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);  // NOTE(review): kernel now weights byte 3 too; confirm this table's 4th coefficient is 0
 }

 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);  // NOTE(review): kernel now weights byte 3 too; confirm this table's 4th coefficient is 0
 }

 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);  // NOTE(review): kernel now weights byte 3 too; confirm this table's 4th coefficient is 0
 }

 void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);  // NOTE(review): kernel now weights byte 3 too; confirm this table's 4th coefficient is 0
 }

 void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
                             uint8_t* dst_y,
                             int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants);
+  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kArgbI601Constants);  // NOTE(review): udot weights all 4 bytes; confirm this table's 4th coefficient is 0
 }

 void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
                              uint8_t* dst_yj,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kArgbJPEGConstants);  // NOTE(review): udot weights all 4 bytes; confirm this table's 4th coefficient is 0
 }

 void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
                             uint8_t* dst_y,
                             int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kRawI601Constants);
+  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kAbgrI601Constants);  // NOTE(review): udot weights all 4 bytes; confirm this table's 4th coefficient is 0
 }

 void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
                              uint8_t* dst_yj,
                              int width) {
-  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kRawJPEGConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kAbgrJPEGConstants);  // NOTE(review): udot weights all 4 bytes; confirm this table's 4th coefficient is 0
 }

 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
-// Same code as ARGB, except the LD4
-static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
-                                  uint8_t* dst_y,
-                                  int width,
-                                  const struct ArgbConstants* c) {
-  asm volatile(
-      "ldr         s0, [%3]                      \n"  // load rgbconstants
-      "ldr         s1, [%3, #48]                 \n"
-      "dup         v6.16b, v0.b[0]               \n"
-      "dup         v7.16b, v0.b[1]               \n"
-      "dup         v16.16b, v0.b[2]              \n"
-      "dup         v17.8h,  v1.h[0]              \n"
-      "1:          \n"
-      "ld4         {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n"  // load 16
-                                                                 // pixels.
-      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
-      "umull       v0.8h, v2.8b, v6.8b           \n"  // B
-      "umull2      v1.8h, v2.16b, v6.16b         \n"
-      "prfm        pldl1keep, [%0, 448]          \n"
-      "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
-      "umlal2      v1.8h, v3.16b, v7.16b         \n"
-      "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
-      "umlal2      v1.8h, v4.16b, v16.16b        \n"
-      "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
-      "addhn       v1.8b, v1.8h, v17.8h          \n"
-      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
-      "b.gt        1b                            \n"
-      : "+r"(src_rgba),    // %0
-        "+r"(dst_y),       // %1
-        "+r"(width)        // %2
-      : "r"(c)  // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-        "v17");
-}

 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);  // generic kernel no longer skips byte 0 (alpha). NOTE(review): confirm the table's 1st coefficient is 0
 }

 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);  // generic kernel no longer skips byte 0 (alpha). NOTE(review): confirm the table's 1st coefficient is 0
 }

 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);  // generic kernel no longer skips byte 0 (alpha). NOTE(review): confirm the table's 1st coefficient is 0
 }

 void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
                             uint8_t* dst_y,
                             int width) {
-  // No need for a separate implementation for RGBA inputs, just permute the
-  // RGB constants.
-  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width,
-                                &kRgb24I601DotProdConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, &kRgbaI601Constants);  // removed table led with a 0 coefficient for byte 0 (alpha). NOTE(review): confirm this one does too
 }

 void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
                              uint8_t* dst_yj,
                              int width) {
-  // No need for a separate implementation for RGBA inputs, just permute the
-  // RGB constants.
-  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width,
-                                &kRgb24JPEGDotProdConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, &kRgbaJPEGConstants);  // removed table led with a 0 coefficient for byte 0 (alpha). NOTE(review): confirm this one does too
 }

 void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
                             uint8_t* dst_y,
                             int width) {
-  // No need for a separate implementation for RGBA inputs, just permute the
-  // RGB constants.
-  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width,
-                                &kRawI601DotProdConstants);
+  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, &kBgraI601Constants);  // removed table led with a 0 coefficient for byte 0 (alpha). NOTE(review): confirm this one does too
 }

 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -3978,30 +3712,32 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                                  int width,
                                  const struct ArgbConstants* c) {
  asm volatile(
-      "ldr         d0, [%3]                      \n"  // load rgbconstants
-      "dup         v5.16b, v0.b[0]               \n"
-      "dup         v6.16b, v0.b[1]               \n"
-      "dup         v7.16b, v0.b[2]               \n"
-      "dup         v16.8h,  v0.h[2]              \n"
+      "ldr         s16, [%3]                     \n"  // load 32 bits at c+0; only bytes 0-2 are used (3 channels)
+      "ldr         s17, [%3, #48]                \n"  // load 32 bits at c+48; low half is kAddY[0] (bias was previously read from c+4)
+      "dup         v18.16b, v16.b[0]             \n"  // coeff for byte 0 of each pixel (B)
+      "dup         v19.16b, v16.b[1]             \n"  // coeff for byte 1 (G)
+      "dup         v20.16b, v16.b[2]             \n"  // coeff for byte 2 (R)
+      "dup         v21.8h,  v17.h[0]             \n"  // bias in every 16-bit lane
      "1:          \n"
      "ld3         {v2.16b,v3.16b,v4.16b}, [%0], #48 \n"  // load 16 pixels.
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
-      "umull       v0.8h, v2.8b, v5.8b           \n"  // B
-      "umull2      v1.8h, v2.16b, v5.16b         \n"
+      "umull       v0.8h, v2.8b, v18.8b          \n"  // B, low 8 pixels
+      "umull2      v1.8h, v2.16b, v18.16b        \n"  // B, high 8 pixels
      "prfm        pldl1keep, [%0, 448]          \n"
-      "umlal       v0.8h, v3.8b, v6.8b           \n"  // G
-      "umlal2      v1.8h, v3.16b, v6.16b         \n"
-      "umlal       v0.8h, v4.8b, v7.8b           \n"  // R
-      "umlal2      v1.8h, v4.16b, v7.16b         \n"
-      "addhn       v0.8b, v0.8h, v16.8h          \n"  // 16 bit to 8 bit Y
-      "addhn       v1.8b, v1.8h, v16.8h          \n"
+      "umlal       v0.8h, v3.8b, v19.8b          \n"  // G
+      "umlal2      v1.8h, v3.16b, v19.16b        \n"
+      "umlal       v0.8h, v4.8b, v20.8b          \n"  // R
+      "umlal2      v1.8h, v4.16b, v20.16b        \n"
+      "addhn       v0.8b, v0.8h, v21.8h          \n"  // add bias, keep high byte: 16 bit to 8 bit Y
+      "addhn       v1.8b, v1.8h, v21.8h          \n"
      "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
      : "+r"(src_rgb),     // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      : "r"(c)  // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
+        "v19", "v20", "v21");  // coeffs and bias now live in v16-v21; v5-v7 are no longer touched
 }