diff --git a/README.chromium b/README.chromium
index b44f26f62..017031390 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1903
+Version: 1904
 License: BSD
 License File: LICENSE
 Shipped: yes
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 84f35c4d0..e26e427d0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1903
+#define LIBYUV_VERSION 1904
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_common.cc b/source/row_common.cc
index 5182a1d8d..5e1551b99 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -662,15 +662,13 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
 }
 #endif
 
-// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb.
+// Non-PAVGB (ARM) path: take uint16_t channel averages, as the NEON code does.
 #if !defined(LIBYUV_ARGBTOUV_PAVGB)
-static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
-  return STATIC_CAST(
-      uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8);
+static __inline int RGBxToU(uint16_t r, uint16_t g, uint16_t b) {
+  return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8);
 }
-static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
-  return STATIC_CAST(
-      uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8);
+static __inline int RGBxToV(uint16_t r, uint16_t g, uint16_t b) {
+  return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8);
 }
 #endif
 
@@ -713,7 +711,7 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
     }                                                                      \
   }
 #else
-// ARM version does sum / 2 then multiply by 2x smaller coefficients
+// ARM version does average of 4 pixels with rounding
 #define MAKEROWY(NAME, R, G, B, BPP)                                       \
   void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
     int x;                                                                 \
@@ -729,27 +727,27 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
     int x;                                                                 \
     for (x = 0; x < width - 1; x += 2) {                                   \
       uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] +         \
-                     src_rgb1[B + BPP] + 1) >>                             \
-                    1;                                                     \
+                     src_rgb1[B + BPP] + 2) >>                             \
+                    2;                                                     \
       uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] +         \
-                     src_rgb1[G + BPP] + 1) >>                             \
-                    1;                                                     \
+                     src_rgb1[G + BPP] + 2) >>                             \
+                    2;                                                     \
       uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] +         \
-                     src_rgb1[R + BPP] + 1) >>                             \
-                    1;                                                     \
-      dst_u[0] = RGB2xToU(ar, ag, ab);                                     \
-      dst_v[0] = RGB2xToV(ar, ag, ab);                                     \
+                     src_rgb1[R + BPP] + 2) >>                             \
+                    2;                                                     \
+      dst_u[0] = RGBxToU(ar, ag, ab);                                      \
+      dst_v[0] = RGBxToV(ar, ag, ab);                                      \
       src_rgb += BPP * 2;                                                  \
       src_rgb1 += BPP * 2;                                                 \
       dst_u += 1;                                                          \
       dst_v += 1;                                                          \
     }                                                                      \
     if (width & 1) {                                                       \
-      uint16_t ab = src_rgb[B] + src_rgb1[B];                              \
-      uint16_t ag = src_rgb[G] + src_rgb1[G];                              \
-      uint16_t ar = src_rgb[R] + src_rgb1[R];                              \
-      dst_u[0] = RGB2xToU(ar, ag, ab);                                     \
-      dst_v[0] = RGB2xToV(ar, ag, ab);                                     \
+      uint16_t ab = (src_rgb[B] + src_rgb1[B] + 1) >> 1;                   \
+      uint16_t ag = (src_rgb[G] + src_rgb1[G] + 1) >> 1;                   \
+      uint16_t ar = (src_rgb[R] + src_rgb1[R] + 1) >> 1;                   \
+      dst_u[0] = RGBxToU(ar, ag, ab);                                      \
+      dst_v[0] = RGBxToV(ar, ag, ab);                                      \
     }                                                                      \
   }
 #endif
@@ -806,11 +804,11 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
   return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 }
 #if !defined(LIBYUV_ARGBTOUV_PAVGB)
-static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
-  return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+static __inline uint8_t RGBxToUJ(uint16_t r, uint16_t g, uint16_t b) {
+  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
 }
-static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
-  return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+static __inline uint8_t RGBxToVJ(uint16_t r, uint16_t g, uint16_t b) {
+  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 }
 #endif
 
@@ -853,7 +851,7 @@ static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
     }                                                                       \
   }
 #else
-// ARM version does sum / 2 then multiply by 2x smaller coefficients
+// ARM version does average of 4 pixels with rounding
 #define MAKEROWYJ(NAME, R, G, B, BPP)                                       \
   void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
     int x;                                                                  \
@@ -869,27 +867,27 @@ static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
     int x;                                                                  \
     for (x = 0; x < width - 1; x += 2) {                                    \
       uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] +          \
-                     src_rgb1[B + BPP] + 1) >>                              \
-                    1;                                                      \
+                     src_rgb1[B + BPP] + 2) >>                              \
+                    2;                                                      \
       uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] +          \
-                     src_rgb1[G + BPP] + 1) >>                              \
-                    1;                                                      \
+                     src_rgb1[G + BPP] + 2) >>                              \
+                    2;                                                      \
       uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] +          \
-                     src_rgb1[R + BPP] + 1) >>                              \
-                    1;                                                      \
-      dst_u[0] = RGB2xToUJ(ar, ag, ab);                                     \
-      dst_v[0] = RGB2xToVJ(ar, ag, ab);                                     \
+                     src_rgb1[R + BPP] + 2) >>                              \
+                    2;                                                      \
+      dst_u[0] = RGBxToUJ(ar, ag, ab);                                      \
+      dst_v[0] = RGBxToVJ(ar, ag, ab);                                      \
       src_rgb += BPP * 2;                                                   \
       src_rgb1 += BPP * 2;                                                  \
       dst_u += 1;                                                           \
       dst_v += 1;                                                           \
     }                                                                       \
     if (width & 1) {                                                        \
-      uint16_t ab = (src_rgb[B] + src_rgb1[B]);                             \
-      uint16_t ag = (src_rgb[G] + src_rgb1[G]);                             \
-      uint16_t ar = (src_rgb[R] + src_rgb1[R]);                             \
-      dst_u[0] = RGB2xToUJ(ar, ag, ab);                                     \
-      dst_v[0] = RGB2xToVJ(ar, ag, ab);                                     \
+      uint16_t ab = (src_rgb[B] + src_rgb1[B] + 1) >> 1;                    \
+      uint16_t ag = (src_rgb[G] + src_rgb1[G] + 1) >> 1;                    \
+      uint16_t ar = (src_rgb[R] + src_rgb1[R] + 1) >> 1;                    \
+      dst_u[0] = RGBxToUJ(ar, ag, ab);                                      \
+      dst_v[0] = RGBxToVJ(ar, ag, ab);                                      \
     }                                                                       \
   }
 
@@ -994,11 +992,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
 #else
-    uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
-    uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
-    uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
-    dst_u[0] = RGB2xToU(r, g, b);
-    dst_v[0] = RGB2xToV(r, g, b);
+    uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
+    uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
+    uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
+    dst_u[0] = RGBxToU(r, g, b);
+    dst_v[0] = RGBxToV(r, g, b);
 #endif
 
     src_rgb565 += 4;
@@ -1029,11 +1027,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
 #else
-    uint16_t b = b0 + b2;
-    uint16_t g = g0 + g2;
-    uint16_t r = r0 + r2;
-    dst_u[0] = RGB2xToU(r, g, b);
-    dst_v[0] = RGB2xToV(r, g, b);
+    uint16_t b = (b0 + b2 + 1) >> 1;
+    uint16_t g = (g0 + g2 + 1) >> 1;
+    uint16_t r = (r0 + r2 + 1) >> 1;
+    dst_u[0] = RGBxToU(r, g, b);
+    dst_v[0] = RGBxToV(r, g, b);
 #endif
   }
 }
@@ -1083,11 +1081,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
 #else
-    uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
-    uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
-    uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
-    dst_u[0] = RGB2xToU(r, g, b);
-    dst_v[0] = RGB2xToV(r, g, b);
+    uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
+    uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
+    uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
+    dst_u[0] = RGBxToU(r, g, b);
+    dst_v[0] = RGBxToV(r, g, b);
 #endif
 
     src_argb1555 += 4;
@@ -1119,11 +1117,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
 #else
-    uint16_t b = b0 + b2;
-    uint16_t g = g0 + g2;
-    uint16_t r = r0 + r2;
-    dst_u[0] = RGB2xToU(r, g, b);
-    dst_v[0] = RGB2xToV(r, g, b);
+    uint16_t b = (b0 + b2 + 1) >> 1;
+    uint16_t g = (g0 + g2 + 1) >> 1;
+    uint16_t r = (r0 + r2 + 1) >> 1;
+    dst_u[0] = RGBxToU(r, g, b);
+    dst_v[0] = RGBxToV(r, g, b);
 #endif
   }
 }
@@ -1169,11 +1167,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
 #else
-    uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
-    uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
-    uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
-    dst_u[0] = RGB2xToU(r, g, b);
-    dst_v[0] = RGB2xToV(r, g, b);
+    uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
+    uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
+    uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
+    dst_u[0] = RGBxToU(r, g, b);
+    dst_v[0] = RGBxToV(r, g, b);
 #endif
 
     src_argb4444 += 4;
@@ -1203,11 +1201,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
 #else
-    uint16_t b = b0 + b2;
-    uint16_t g = g0 + g2;
-    uint16_t r = r0 + r2;
-    dst_u[0] = RGB2xToU(r, g, b);
-    dst_v[0] = RGB2xToV(r, g, b);
+    uint16_t b = (b0 + b2 + 1) >> 1;
+    uint16_t g = (g0 + g2 + 1) >> 1;
+    uint16_t r = (r0 + r2 + 1) >> 1;
+    dst_u[0] = RGBxToU(r, g, b);
+    dst_v[0] = RGBxToV(r, g, b);
 #endif
   }
 }
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 49d7584dc..16ad3a936 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1933,11 +1933,11 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_argb
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
@@ -1952,9 +1952,9 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q0, q1, q2)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -1971,7 +1971,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
   );
 }
 
-// TODO(fbarchard): Subsample match Intel code.
 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
@@ -1979,11 +1978,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_argb
-      "vmov.s16    q10, #127 / 2                 \n"  // UB / VR 0.500 coefficient
-      "vmov.s16    q11, #84 / 2                  \n"  // UG -0.33126 coefficient
-      "vmov.s16    q12, #43 / 2                  \n"  // UR -0.16874 coefficient
-      "vmov.s16    q13, #20 / 2                  \n"  // VB -0.08131 coefficient
-      "vmov.s16    q14, #107 / 2                 \n"  // VG -0.41869 coefficient
+      "vmov.s16    q10, #127                     \n"  // UB / VR 0.500 coefficient
+      "vmov.s16    q11, #84                      \n"  // UG -0.33126 coefficient
+      "vmov.s16    q12, #43                      \n"  // UR -0.16874 coefficient
+      "vmov.s16    q13, #20                      \n"  // VB -0.08131 coefficient
+      "vmov.s16    q14, #107                     \n"  // VG -0.41869 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
@@ -1998,9 +1997,9 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q0, q1, q2)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2024,11 +2023,11 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                        int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_argb
-      "vmov.s16    q10, #127 / 2                 \n"  // UB / VR 0.500 coefficient
-      "vmov.s16    q11, #84 / 2                  \n"  // UG -0.33126 coefficient
-      "vmov.s16    q12, #43 / 2                  \n"  // UR -0.16874 coefficient
-      "vmov.s16    q13, #20 / 2                  \n"  // VB -0.08131 coefficient
-      "vmov.s16    q14, #107 / 2                 \n"  // VG -0.41869 coefficient
+      "vmov.s16    q10, #127                     \n"  // UB / VR 0.500 coefficient
+      "vmov.s16    q11, #84                      \n"  // UG -0.33126 coefficient
+      "vmov.s16    q12, #43                      \n"  // UR -0.16874 coefficient
+      "vmov.s16    q13, #20                      \n"  // VB -0.08131 coefficient
+      "vmov.s16    q14, #107                     \n"  // VG -0.41869 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ABGR pixels.
@@ -2043,9 +2042,9 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q2, q6                        \n"  // B 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q2, q1, q0)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2070,11 +2069,11 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
                         int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_rgb24
-      "vmov.s16    q10, #127 / 2                 \n"  // UB / VR 0.500 coefficient
-      "vmov.s16    q11, #84 / 2                  \n"  // UG -0.33126 coefficient
-      "vmov.s16    q12, #43 / 2                  \n"  // UR -0.16874 coefficient
-      "vmov.s16    q13, #20 / 2                  \n"  // VB -0.08131 coefficient
-      "vmov.s16    q14, #107 / 2                 \n"  // VG -0.41869 coefficient
+      "vmov.s16    q10, #127                     \n"  // UB / VR 0.500 coefficient
+      "vmov.s16    q11, #84                      \n"  // UG -0.33126 coefficient
+      "vmov.s16    q12, #43                      \n"  // UR -0.16874 coefficient
+      "vmov.s16    q13, #20                      \n"  // VB -0.08131 coefficient
+      "vmov.s16    q14, #107                     \n"  // VG -0.41869 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld3.8      {d0, d2, d4}, [%0]!           \n"  // load 8 RGB24 pixels.
@@ -2089,9 +2088,9 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q0, q1, q2)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2116,11 +2115,11 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
                       int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_raw
-      "vmov.s16    q10, #127 / 2                 \n"  // UB / VR 0.500 coefficient
-      "vmov.s16    q11, #84 / 2                  \n"  // UG -0.33126 coefficient
-      "vmov.s16    q12, #43 / 2                  \n"  // UR -0.16874 coefficient
-      "vmov.s16    q13, #20 / 2                  \n"  // VB -0.08131 coefficient
-      "vmov.s16    q14, #107 / 2                 \n"  // VG -0.41869 coefficient
+      "vmov.s16    q10, #127                     \n"  // UB / VR 0.500 coefficient
+      "vmov.s16    q11, #84                      \n"  // UG -0.33126 coefficient
+      "vmov.s16    q12, #43                      \n"  // UR -0.16874 coefficient
+      "vmov.s16    q13, #20                      \n"  // VB -0.08131 coefficient
+      "vmov.s16    q14, #107                     \n"  // VG -0.41869 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld3.8      {d0, d2, d4}, [%0]!           \n"  // load 8 RAW pixels.
@@ -2135,9 +2134,9 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q2, q1, q0)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2161,11 +2160,11 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                       int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_bgra
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 BGRA pixels.
@@ -2180,9 +2179,9 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
       "vpadal.u8   q2, q6                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q1, q5                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q1, q1, #1                    \n"  // 2x average
-      "vrshr.u16   q2, q2, #1                    \n"
-      "vrshr.u16   q3, q3, #1                    \n"
+      "vrshr.u16   q1, q1, #2                    \n"  // average of 4
+      "vrshr.u16   q2, q2, #2                    \n"
+      "vrshr.u16   q3, q3, #2                    \n"
 
     RGBTOUV(q3, q2, q1)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2190,7 +2189,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
       "bgt         1b                            \n"
   : "+r"(src_bgra),  // %0
     "+r"(src_stride_bgra),  // %1
-    "+r"(dst_u),     // %2
+    "+r"(dst_u),     // %2
     "+r"(dst_v),     // %3
     "+r"(width)        // %4
   :
@@ -2206,11 +2205,11 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                       int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_abgr
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ABGR pixels.
@@ -2225,9 +2224,9 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q2, q1, q0)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2251,11 +2250,11 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                       int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_rgba
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 RGBA pixels.
@@ -2270,9 +2269,9 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
       "vpadal.u8   q1, q6                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q2, q7                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q0, q1, q2)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2296,11 +2295,11 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                        int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_rgb24
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld3.8      {d0, d2, d4}, [%0]!           \n"  // load 8 RGB24 pixels.
@@ -2315,9 +2314,9 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q2, q6                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q0, q1, q2)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2341,11 +2340,11 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
                      int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_raw
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875 coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld3.8      {d0, d2, d4}, [%0]!           \n"  // load 8 RAW pixels.
@@ -2360,9 +2359,9 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
       "vpadal.u8   q1, q5                        \n"  // G 16 bytes -> 8 shorts.
       "vpadal.u8   q0, q4                        \n"  // R 16 bytes -> 8 shorts.
 
-      "vrshr.u16   q0, q0, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q1, #1                    \n"
-      "vrshr.u16   q2, q2, #1                    \n"
+      "vrshr.u16   q0, q0, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q1, #2                    \n"
+      "vrshr.u16   q2, q2, #2                    \n"
 
     RGBTOUV(q2, q1, q0)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
@@ -2387,12 +2386,11 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                         int width) {
   asm volatile(
       "add         %1, %0, %1                    \n"  // src_stride + src_argb
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875
-                                                      // coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld1.8      {q0}, [%0]!                   \n"  // load 8 RGB565 pixels.
@@ -2418,9 +2416,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
       "vpadal.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
       "vpadal.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
 
-      "vrshr.u16   q4, q4, #1                    \n"  // 2x average
-      "vrshr.u16   q5, q5, #1                    \n"
-      "vrshr.u16   q6, q6, #1                    \n"
+      "vrshr.u16   q4, q4, #2                    \n"  // average of 4
+      "vrshr.u16   q5, q5, #2                    \n"
+      "vrshr.u16   q6, q6, #2                    \n"
 
       "vmul.s16    q8, q4, q10                   \n"  // B
       "vmls.s16    q8, q5, q11                   \n"  // G
@@ -2453,12 +2451,11 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                           int width) {
   asm volatile(
       "add         %1, %0, %1                    \n"  // src_stride + src_argb
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875
-                                                      // coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB1555 pixels.
@@ -2484,9 +2481,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
       "vpadal.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
       "vpadal.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
 
-      "vrshr.u16   q4, q4, #1                    \n"  // 2x average
-      "vrshr.u16   q5, q5, #1                    \n"
-      "vrshr.u16   q6, q6, #1                    \n"
+      "vrshr.u16   q4, q4, #2                    \n"  // average of 4
+      "vrshr.u16   q5, q5, #2                    \n"
+      "vrshr.u16   q6, q6, #2                    \n"
 
       "vmul.s16    q8, q4, q10                   \n"  // B
       "vmls.s16    q8, q5, q11                   \n"  // G
@@ -2519,12 +2516,11 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                           int width) {
   asm volatile (
       "add         %1, %0, %1                    \n"  // src_stride + src_argb
-      "vmov.s16    q10, #112 / 2                 \n"  // UB / VR 0.875
-                                                      // coefficient
-      "vmov.s16    q11, #74 / 2                  \n"  // UG -0.5781 coefficient
-      "vmov.s16    q12, #38 / 2                  \n"  // UR -0.2969 coefficient
-      "vmov.s16    q13, #18 / 2                  \n"  // VB -0.1406 coefficient
-      "vmov.s16    q14, #94 / 2                  \n"  // VG -0.7344 coefficient
+      "vmov.s16    q10, #112                     \n"  // UB/VR 0.875 coefficient
+      "vmov.s16    q11, #74                      \n"  // UG -0.5781 coefficient
+      "vmov.s16    q12, #38                      \n"  // UR -0.2969 coefficient
+      "vmov.s16    q13, #18                      \n"  // VB -0.1406 coefficient
+      "vmov.s16    q14, #94                      \n"  // VG -0.7344 coefficient
       "vmov.u16    q15, #0x8080                  \n"  // 128.5
       "1:                                        \n"
       "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB4444 pixels.
@@ -2550,9 +2546,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
       "vpadal.u8   d11, d1                       \n"  // G 8 bytes -> 4 shorts.
       "vpadal.u8   d13, d2                       \n"  // R 8 bytes -> 4 shorts.
 
-      "vrshr.u16   q0, q4, #1                    \n"  // 2x average
-      "vrshr.u16   q1, q5, #1                    \n"
-      "vrshr.u16   q2, q6, #1                    \n"
+      "vrshr.u16   q0, q4, #2                    \n"  // average of 4
+      "vrshr.u16   q1, q5, #2                    \n"
+      "vrshr.u16   q2, q6, #2                    \n"
 
       RGBTOUV(q0, q1, q2)
       "vst1.8      {d0}, [%2]!                   \n"  // store 8 pixels U.
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index c30ef680c..71e132876 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2853,13 +2853,13 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
                                  &kRgb24JPEGUVConstants);
 }
 
-#define RGBTOUV_SETUP_REG                                                  \
-  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
-  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
-  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
-  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
-  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
-  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
+#define RGBTOUV_SETUP_REG                                             \
+  "movi       v20.8h, #112     \n" /* UB/VR coeff 0.875 = 112/128  */ \
+  "movi       v21.8h, #74      \n" /* UG coeff -0.5781 = -(74/128) */ \
+  "movi       v22.8h, #38      \n" /* UR coeff -0.2969 = -(38/128) */ \
+  "movi       v23.8h, #18      \n" /* VB coeff -0.1406 = -(18/128) */ \
+  "movi       v24.8h, #94      \n" /* VG coeff -0.7344 = -(94/128) */ \
+  "movi       v25.16b, #0x80   \n" /* 0x8080 per 16-bit lane = 128.5 * 256 */
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
 // clang-format off
@@ -2899,9 +2899,9 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -2918,7 +2918,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
   );
 }
 
-// TODO(fbarchard): Subsample match Intel code.
 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
@@ -2926,11 +2925,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        int width) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
-      "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
-      "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
-      "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
-      "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
-      "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
+      "movi        v20.8h, #127                  \n"  // UB/VR coeff (0.500)
+      "movi        v21.8h, #84                   \n"  // UG coeff (-0.33126)
+      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
+      "movi        v23.8h, #20                   \n"  // VB coeff (-0.08131)
+      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
       "1:                                        \n"
       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
@@ -2945,9 +2944,9 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -2971,11 +2970,11 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                        int width) {
   const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
   asm volatile (
-      "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
-      "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
-      "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
-      "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
-      "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
+      "movi        v20.8h, #127                  \n"  // UB/VR coeff (0.500)
+      "movi        v21.8h, #84                   \n"  // UG coeff (-0.33126)
+      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
+      "movi        v23.8h, #20                   \n"  // VB coeff (-0.08131)
+      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
       "1:                                        \n"
       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
@@ -2990,9 +2989,9 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v2.8h, v1.8h, v0.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3016,11 +3015,11 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
                         int width) {
   const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   asm volatile (
-      "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
-      "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
-      "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
-      "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
-      "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
+      "movi        v20.8h, #127                  \n"  // UB/VR coeff (0.500)
+      "movi        v21.8h, #84                   \n"  // UG coeff (-0.33126)
+      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
+      "movi        v23.8h, #20                   \n"  // VB coeff (-0.08131)
+      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
       "1:                                        \n"
       "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
@@ -3035,9 +3034,9 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3061,11 +3060,11 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
                       int width) {
   const uint8_t* src_raw_1 = src_raw + src_stride_raw;
   asm volatile (
-      "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
-      "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
-      "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
-      "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
-      "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
+      "movi        v20.8h, #127                  \n"  // UB/VR coeff (0.500)
+      "movi        v21.8h, #84                   \n"  // UG coeff (-0.33126)
+      "movi        v22.8h, #43                   \n"  // UR coeff (-0.16874)
+      "movi        v23.8h, #20                   \n"  // VB coeff (-0.08131)
+      "movi        v24.8h, #107                  \n"  // VG coeff (-0.41869)
       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
       "1:                                        \n"
       "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
@@ -3080,9 +3079,9 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v2.8h, v1.8h, v0.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3120,9 +3119,9 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
       "uadalp      v3.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v5.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v3.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v3.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3160,9 +3159,9 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
       "uadalp      v2.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v1.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v3.8h, #1              \n"  // 2x average
-      "urshr       v2.8h, v2.8h, #1              \n"
-      "urshr       v1.8h, v1.8h, #1              \n"
+      "urshr       v0.8h, v3.8h, #2              \n"  // average of 4
+      "urshr       v2.8h, v2.8h, #2              \n"
+      "urshr       v1.8h, v1.8h, #2              \n"
 
     RGBTOUV(v0.8h, v2.8h, v1.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3200,9 +3199,9 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
       "uadalp      v1.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v7.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3240,9 +3239,9 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v2.8h, v2.8h, #1              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"
 
     RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3280,9 +3279,9 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v2.8h, v2.8h, #1              \n"  // 2x average
-      "urshr       v1.8h, v1.8h, #1              \n"
-      "urshr       v0.8h, v0.8h, #1              \n"
+      "urshr       v2.8h, v2.8h, #2              \n"  // average of 4
+      "urshr       v1.8h, v1.8h, #2              \n"
+      "urshr       v0.8h, v0.8h, #2              \n"
 
     RGBTOUV(v2.8h, v1.8h, v0.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3324,9 +3323,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
       "uadalp      v17.8h, v1.16b                \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v18.8h, v2.16b                \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
-      "urshr       v1.8h, v17.8h, #1             \n"
-      "urshr       v2.8h, v18.8h, #1             \n"
+      "urshr       v0.8h, v16.8h, #2             \n"  // average of 4
+      "urshr       v1.8h, v17.8h, #2             \n"
+      "urshr       v2.8h, v18.8h, #2             \n"
 
       RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3368,9 +3367,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
       "uadalp      v17.8h, v1.16b                \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v18.8h, v2.16b                \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
-      "urshr       v1.8h, v17.8h, #1             \n"
-      "urshr       v2.8h, v18.8h, #1             \n"
+      "urshr       v0.8h, v16.8h, #2             \n"  // average of 4
+      "urshr       v1.8h, v17.8h, #2             \n"
+      "urshr       v2.8h, v18.8h, #2             \n"
 
       RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
@@ -3412,9 +3411,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
       "uadalp      v17.8h, v1.16b                \n"  // G 16 bytes -> 8 shorts.
       "uadalp      v18.8h, v2.16b                \n"  // R 16 bytes -> 8 shorts.
 
-      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
-      "urshr       v1.8h, v17.8h, #1             \n"
-      "urshr       v2.8h, v18.8h, #1             \n"
+      "urshr       v0.8h, v16.8h, #2             \n"  // average of 4
+      "urshr       v1.8h, v17.8h, #2             \n"
+      "urshr       v2.8h, v18.8h, #2             \n"
 
       RGBTOUV(v0.8h, v1.8h, v2.8h)
       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 0bab8e16f..27bf87a6c 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -205,32 +205,32 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
 
 static const int16_t kARGBToUVCoefficients[] = {
     // UB, -UR, -UG, 0, -VB, VR, -VG, 0
-    56, -19, -37, 0, -9, 56, -47, 0,
+    112, -38, -74, 0, -18, 112, -94, 0,  // U and V weights each sum to 0
 };
 
 static const int16_t kRGBAToUVCoefficients[] = {
     // 0, -UG, UB, -UR, 0, -VG, -VB, VR
-    0, -37, 56, -19, 0, -47, -9, 56,
+    0, -74, 112, -38, 0, -94, -18, 112,  // U and V weights each sum to 0
 };
 
 static const int16_t kBGRAToUVCoefficients[] = {
     // 0, -UG, -UR, UB, 0, -VG, VR, -VB
-    0, -37, -19, 56, 0, -47, 56, -9,
+    0, -74, -38, 112, 0, -94, 112, -18,  // U and V weights each sum to 0
 };
 
 static const int16_t kABGRToUVCoefficients[] = {
     // -UR, UB, -UG, 0, VR, -VB, -VG, 0
-    -19, 56, -37, 0, 56, -9, -47, 0,
+    -38, 112, -74, 0, 112, -18, -94, 0,  // U and V weights each sum to 0
 };
 
 static const int16_t kARGBToUVJCoefficients[] = {
     // UB, -UR, -UG, 0, -VB, VR, -VG, 0
-    63, -21, -42, 0, -10, 63, -53, 0,
+    127, -43, -84, 0, -20, 127, -107, 0,  // U and V weights each sum to 0
 };
 
 static const int16_t kABGRToUVJCoefficients[] = {
     // -UR, UB, -UG, 0, VR, -VB, -VG, 0
-    -21, 63, -42, 0, 63, -10, -53, 0,
+    -43, 127, -84, 0, 127, -20, -107, 0,  // U and V weights each sum to 0
 };
 
 static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
@@ -285,10 +285,15 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
 
       "subs     %w[width], %w[width], %w[vl]        \n"  // 4*VL per loop
 
-      "urhadd   z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
-      "urhadd   z2.h, p0/m, z2.h, z3.h              \n"  // brgabrga
-      "urhadd   z4.h, p0/m, z4.h, z5.h              \n"  // brgabrga
-      "urhadd   z6.h, p0/m, z6.h, z7.h              \n"  // brgabrga
+      "add      z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
+      "add      z2.h, p0/m, z2.h, z3.h              \n"  // brgabrga
+      "add      z4.h, p0/m, z4.h, z5.h              \n"  // brgabrga
+      "add      z6.h, p0/m, z6.h, z7.h              \n"  // brgabrga
+
+      "urshr    z0.h, p0/m, z0.h, #2                \n"  // brgabrga
+      "urshr    z2.h, p0/m, z2.h, #2                \n"  // brgabrga
+      "urshr    z4.h, p0/m, z4.h, #2                \n"  // brgabrga
+      "urshr    z6.h, p0/m, z6.h, #2                \n"  // brgabrga
 
       "movi     v16.8h, #0                          \n"
       "movi     v17.8h, #0                          \n"
@@ -350,7 +355,9 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
       "trn1     z0.s, z16.s, z17.s                  \n"  // brgabgra
       "trn2     z1.s, z16.s, z17.s                  \n"  // brgabgra
 
-      "urhadd   z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
+      "add      z0.h, p0/m, z0.h, z1.h              \n"  // brgabrga
+
+      "urshr    z0.h, p0/m, z0.h, #2                \n"  // brgabrga
 
       "subs     %w[width], %w[width], %w[vl]        \n"  // VL per loop
 
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index be36343b0..eb0d4bbd9 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -2076,7 +2076,7 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
   }
 
   uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381);
-  EXPECT_EQ(2755440272u, checksum);
+  EXPECT_EQ(4157186353u, checksum);
 
   free_aligned_buffer_page_end(orig_rgb24);
   free_aligned_buffer_page_end(dest_j420);