From 1170363ce55fec2a256ce383479d8a6a3edadffe Mon Sep 17 00:00:00 2001
From: Dale Curtis <dalecurtis@chromium.org>
Date: Thu, 19 Mar 2026 23:39:57 +0000
Subject: [PATCH] Add Gemini implementation for NEON32 RGB to YUV matrix
 operations

These are about 25% faster than the C versions.

Bug: libyuv:42280902

Change-Id: I8b298670ee5f3ed5db35527fc41d6d9a51b020a1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7573682
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Dale Curtis <dalecurtis@chromium.org>
---
 include/libyuv/row.h        |  49 +++++++--
 source/convert.cc           |   8 ++
 source/convert_from_argb.cc |  32 ++++++
 source/row_any.cc           |  24 +++++
 source/row_common.cc        |  10 ++
 source/row_neon.cc          | 205 ++++++++++++++----------------------
 6 files changed, 196 insertions(+), 132 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 034ff866e..db875b74f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -406,11 +406,17 @@ extern "C" {
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565DITHERROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
+#if !defined(__aarch64__)
+#define HAS_ARGBTOUV444MATRIXROW_NEON
+#endif
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVJ444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
+#if !defined(__aarch64__)
+#define HAS_ARGBTOYMATRIXROW_NEON
+#endif
 #define HAS_ARGBTOYROW_NEON
 #define HAS_AYUVTOUVROW_NEON
 #define HAS_AYUVTOVUROW_NEON
@@ -975,20 +981,19 @@ typedef uint32_t ulvec32[8];
 typedef uint8_t ulvec8[32];
 #endif
 
-struct ArgbConstants {
-  uint8_t kRGBToY[32];
-  int8_t kRGBToU[32];
-  int8_t kRGBToV[32];
-  uint16_t kAddY[16];
-  uint16_t kAddUV[16];
-};
-
 #if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
 // This struct is for ARM and RISC-V color conversion.
 struct YuvConstants {
   uvec8 kUVCoeff;
   vec16 kRGBCoeffBias;
 };
+struct ArgbConstants {
+  uvec8 kRGBToY;
+  vec8 kRGBToU;
+  vec8 kRGBToV;
+  uvec16 kAddY;
+  uvec16 kAddUV;
+};
 #else
 // This struct is for Intel color conversion.
 struct YuvConstants {
@@ -998,6 +1003,13 @@ struct YuvConstants {
   int16_t kYToRgb[16];
   int16_t kYBiasToRgb[16];
 };
+struct ArgbConstants {
+  uint8_t kRGBToY[32];
+  int8_t kRGBToU[32];
+  int8_t kRGBToV[32];
+  uint16_t kAddY[16];
+  uint16_t kAddUV[16];
+};
 
 // Offsets into YuvConstants structure
 #define KUVTOB 0
@@ -1778,6 +1790,27 @@ void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
 void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
 void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
 void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
+
+#if !defined(__aarch64__)
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct ArgbConstants* c);
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_y,
+                            int width,
+                            const struct ArgbConstants* c);
+void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb,
+                                   uint8_t* dst_u,
+                                   uint8_t* dst_v,
+                                   int width,
+                                   const struct ArgbConstants* c);
+void ARGBToYMatrixRow_Any_NEON(const uint8_t* src_argb,
+                               uint8_t* dst_y,
+                               int width,
+                               const struct ArgbConstants* c);
+#endif
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
diff --git a/source/convert.cc b/source/convert.cc
index e01442316..cddaf961b 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -2178,6 +2178,14 @@ int ARGBToI420Matrix(const uint8_t* src_argb,
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
 #endif
   if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
       height == 0) {
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 8f6483a02..c7bf41ea8 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -199,6 +199,22 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
       ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUV444MATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_NEON;
+    }
+  }
 #endif
   if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
       height == 0) {
@@ -415,6 +431,14 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
 #endif
   if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
       height == 0) {
@@ -677,6 +701,14 @@ int ARGBToNV12Matrix(const uint8_t* src_argb,
       ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_ARGBTOYMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
+    }
+  }
 #endif
   void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_uv, int width) = MergeUVRow_C;
diff --git a/source/row_any.cc b/source/row_any.cc
index f44bcfb5c..f34f3eb2e 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2253,6 +2253,30 @@ ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31)
 #ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3
 ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUV444MATRIXROW_NEON
+ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
+#endif
+
+#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK)                                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width,          \
+               const struct ArgbConstants* c) {                              \
+    SIMD_ALIGNED(uint8_t vin[128]);                                          \
+    SIMD_ALIGNED(uint8_t vout[128]);                                         \
+    memset(vin, 0, sizeof(vin)); /* for msan */                              \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_ptr, n, c);                                      \
+    }                                                                        \
+    memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP);           \
+    ANY_SIMD(vin, vout, MASK + 1, c);                                        \
+    memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r);                      \
+  }
+
+#ifdef HAS_ARGBTOYMATRIXROW_NEON
+ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
+#endif
+#undef ANY11MC
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
diff --git a/source/row_common.cc b/source/row_common.cc
index a9969d808..8b192a539 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1486,6 +1486,15 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
    {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
 #endif
 
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#define ARGBCONSTANTSBODY(Y0, Y1, Y2, Y3, U0, U1, U2, U3, V0, V1, V2, V3, AY, \
+                          AUV)                                                \
+  {{Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3},          \
+   {U0, U1, U2, U3, U0, U1, U2, U3, U0, U1, U2, U3, U0, U1, U2, U3},          \
+   {V0, V1, V2, V3, V0, V1, V2, V3, V0, V1, V2, V3, V0, V1, V2, V3},          \
+   {AY, AY, AY, AY, AY, AY, AY, AY},                                          \
+   {AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV}}
+#else
 #define ARGBCONSTANTSBODY(Y0, Y1, Y2, Y3, U0, U1, U2, U3, V0, V1, V2, V3, AY, \
                           AUV)                                                \
   {{Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3,           \
@@ -1497,6 +1506,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
    {AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY},          \
    {AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV,     \
     AUV, AUV}}
+#endif
 
 // clang-format on
 
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 1f1a3bbf3..689412668 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"  // For ArgbConstants
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -1840,39 +1841,36 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
   );
 }
 
-// Coefficients expressed as negatives to allow 128
-struct RgbUVConstants {
-  int8_t kRGBToU[4];
-  int8_t kRGBToV[4];
-};
-
 // 8x1 pixels.
-static void ARGBToUV444MatrixRow_NEON(
-    const uint8_t* src_argb,
-    uint8_t* dst_u,
-    uint8_t* dst_v,
-    int width,
-    const struct RgbUVConstants* rgbuvconstants) {
+void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width,
+                               const struct ArgbConstants* c) {
   asm volatile(
-      "vld1.8      {d0}, [%4]                    \n"  // load rgbuvconstants
-      "vdup.u8     d24, d0[0]                    \n"  // UB  0.875  coefficient
-      "vdup.u8     d25, d0[1]                    \n"  // UG -0.5781 coefficient
-      "vdup.u8     d26, d0[2]                    \n"  // UR -0.2969 coefficient
-      "vdup.u8     d27, d0[4]                    \n"  // VB -0.1406 coefficient
-      "vdup.u8     d28, d0[5]                    \n"  // VG -0.7344 coefficient
-      "vneg.s8     d24, d24                      \n"
-      "vmov.u16    q15, #0x8000                  \n"  // 128.0
+      "vld1.8      {d16}, [%4]                   \n"  // load kRGBToU
+      "vld1.8      {d17}, [%5]                   \n"  // load kRGBToV
+      "vld1.16     {d18[0]}, [%6]                \n"  // load kAddUV[0]
+      "vabs.s8     d16, d16                      \n"  // |BU|, |GU|, |RU|
+      "vabs.s8     d17, d17                      \n"  // |BV|, |GV|, |RV|
+      "vdup.8      d20, d16[0]                   \n"  // BU
+      "vdup.8      d21, d16[1]                   \n"  // GU
+      "vdup.8      d22, d16[2]                   \n"  // RU
+      "vdup.8      d23, d17[0]                   \n"  // BV
+      "vdup.8      d24, d17[1]                   \n"  // GV
+      "vdup.8      d25, d17[2]                   \n"  // RV
+      "vdup.16     q15, d18[0]                   \n"  // kAddUV
 
       "1:          \n"
       "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
       "subs        %3, %3, #8                    \n"  // 8 processed per loop.
-      "vmull.u8    q2, d0, d24                   \n"  // B
-      "vmlsl.u8    q2, d1, d25                   \n"  // G
-      "vmlsl.u8    q2, d2, d26                   \n"  // R
+      "vmull.u8    q2, d0, d20                   \n"  // B * BU
+      "vmlsl.u8    q2, d1, d21                   \n"  // - G * GU
+      "vmlsl.u8    q2, d2, d22                   \n"  // - R * RU
 
-      "vmull.u8    q3, d2, d24                   \n"  // R
-      "vmlsl.u8    q3, d1, d28                   \n"  // G
-      "vmlsl.u8    q3, d0, d27                   \n"  // B
+      "vmull.u8    q3, d2, d25                   \n"  // R * RV
+      "vmlsl.u8    q3, d1, d24                   \n"  // - G * GV
+      "vmlsl.u8    q3, d0, d23                   \n"  // - B * BV
 
       "vaddhn.u16  d0, q2, q15                   \n"  // signed -> unsigned
       "vaddhn.u16  d1, q3, q15                   \n"
@@ -1880,53 +1878,32 @@ static void ARGBToUV444MatrixRow_NEON(
       "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
       "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
       "bgt         1b                            \n"
-      : "+r"(src_argb),      // %0
-        "+r"(dst_u),         // %1
-        "+r"(dst_v),         // %2
-        "+r"(width)          // %3
-      : "r"(rgbuvconstants)  // %4
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
-        "q15");
+      : "+r"(src_argb),     // %0
+        "+r"(dst_u),        // %1
+        "+r"(dst_v),        // %2
+        "+r"(width)         // %3
+      : "r"(&c->kRGBToU),   // %4
+        "r"(&c->kRGBToV),   // %5
+        "r"(&c->kAddUV)     // %6
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-// RGB to BT601 coefficients
-// UB   0.875 coefficient = 112
-// UG -0.5781 coefficient = -74
-// UR -0.2969 coefficient = -38
-// VB -0.1406 coefficient = -18
-// VG -0.7344 coefficient = -94
-// VR   0.875 coefficient = 112
-
-static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
-                                                           {18, 94, -112, 0}};
-
 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kARGBI601UVConstants);
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbI601Constants);
 }
 
-// RGB to JPEG coefficients
-// UB  0.500    coefficient = 128
-// UG -0.33126  coefficient = -85
-// UR -0.16874  coefficient = -43
-// VB -0.08131  coefficient = -21
-// VG -0.41869  coefficient = -107
-// VR 0.500     coefficient = 128
-
-static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
-                                                           {21, 107, -128, 0}};
-
 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kARGBJPEGUVConstants);
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
 }
 
+
 // clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
 #define RGBTOUV(QB, QG, QR)                                                 \
@@ -2754,47 +2731,22 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
       : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
 }
 
-struct RgbConstants {
-  uint8_t kRGBToY[4];
-  uint16_t kAddY;
-};
-
-// RGB to JPeg coefficients
-// B * 0.1140 coefficient = 29
-// G * 0.5870 coefficient = 150
-// R * 0.2990 coefficient = 77
-// Add 0.5
-static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
-                                                        0x0080};
-
-static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080};
-
-// RGB to BT.601 coefficients
-// B * 0.1016 coefficient = 25
-// G * 0.5078 coefficient = 129
-// R * 0.2578 coefficient = 66
-// Add 16.5 = 0x1080
-
-static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
-                                                        0x1080};
-
-static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
-
 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
-static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
-                                  uint8_t* dst_y,
-                                  int width,
-                                  const struct RgbConstants* rgbconstants) {
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_y,
+                            int width,
+                            const struct ArgbConstants* c) {
   asm volatile(
-      "vld1.8      {d0}, [%3]                    \n"  // load rgbconstants
-      "vdup.u8     d20, d0[0]                    \n"
-      "vdup.u8     d21, d0[1]                    \n"
-      "vdup.u8     d22, d0[2]                    \n"
-      "vdup.u16    q12, d0[2]                    \n"
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // AY
       "1:          \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of ARGB
       "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
-      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
+      "subs        %1, %1, #16                   \n"  // 16 processed per loop.
       "vmull.u8    q8, d0, d20                   \n"  // B
       "vmull.u8    q9, d1, d20                   \n"
       "vmlal.u8    q8, d2, d21                   \n"  // G
@@ -2803,30 +2755,31 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
       "vmlal.u8    q9, d5, d22                   \n"
       "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
       "vaddhn.u16  d1, q9, q12                   \n"
-      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
+      "vst1.8      {d0, d1}, [%2]!               \n"  // store 16 pixels Y.
       "bgt         1b                            \n"
       : "+r"(src_argb),    // %0
-        "+r"(dst_y),       // %1
-        "+r"(width)        // %2
-      : "r"(rgbconstants)  // %3
+        "+r"(width),       // %1
+        "+r"(dst_y)        // %2
+      : "r"(&c->kRGBToY),  // %3
+        "r"(&c->kAddY)     // %4
       : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
         "q12");
 }
 
 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);
 }
 
 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);
 }
 
 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);
 }
 
 void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
-  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
+  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
 }
 
 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
@@ -2834,13 +2787,14 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
 static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
                                   uint8_t* dst_y,
                                   int width,
-                                  const struct RgbConstants* rgbconstants) {
+                                  const struct ArgbConstants* c) {
   asm volatile(
-      "vld1.8      {d0}, [%3]                    \n"  // load rgbconstants
-      "vdup.u8     d20, d0[0]                    \n"
-      "vdup.u8     d21, d0[1]                    \n"
-      "vdup.u8     d22, d0[2]                    \n"
-      "vdup.u16    q12, d0[2]                    \n"
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // AY
       "1:          \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of RGBA
       "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
@@ -2858,33 +2812,35 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
       : "+r"(src_rgba),    // %0
         "+r"(dst_y),       // %1
         "+r"(width)        // %2
-      : "r"(rgbconstants)  // %3
+      : "r"(&c->kRGBToY),  // %3
+        "r"(&c->kAddY)     // %4
       : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
         "q12");
 }
 
 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
 }
 
 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
-  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
 }
 
 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
-  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
 }
 
 static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                                  uint8_t* dst_y,
                                  int width,
-                                 const struct RgbConstants* rgbconstants) {
+                                 const struct ArgbConstants* c) {
   asm volatile(
-      "vld1.8      {d0}, [%3]                    \n"  // load rgbconstants
-      "vdup.u8     d20, d0[0]                    \n"
-      "vdup.u8     d21, d0[1]                    \n"
-      "vdup.u8     d22, d0[2]                    \n"
-      "vdup.u16    q12, d0[2]                    \n"
+      "vld1.8      {d16}, [%3]                   \n"  // load kRGBToY
+      "vld1.16     {d18[0]}, [%4]                \n"  // load kAddY[0]
+      "vdup.8      d20, d16[0]                   \n"  // BY
+      "vdup.8      d21, d16[1]                   \n"  // GY
+      "vdup.8      d22, d16[2]                   \n"  // RY
+      "vdup.16     q12, d18[0]                   \n"  // AY
       "1:          \n"
       "vld3.8      {d2, d4, d6}, [%0]!           \n"  // load 16 pixels of
                                                       // RGB24.
@@ -2903,25 +2859,26 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
       : "+r"(src_rgb),     // %0
         "+r"(dst_y),       // %1
         "+r"(width)        // %2
-      : "r"(rgbconstants)  // %3
+      : "r"(&c->kRGBToY),  // %3
+        "r"(&c->kAddY)     // %4
       : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
         "q12");
 }
 
 void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
-  RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+  RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants);
 }
 
 void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
-  RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
+  RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants);
 }
 
 void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
-  RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
+  RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants);
 }
 
 void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
-  RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
+  RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants);
 }
 
 // Bilinear filter 16x2 -> 16x1