diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ae9f595c8..e51155c08 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -441,6 +441,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_BYTETOFLOATROW_NEON
+#define HAS_CONVERT16TO8ROW_NEON
 #define HAS_COPYROW_NEON
 #define HAS_DETILEROW_NEON
 #define HAS_DETILESPLITUVROW_NEON
@@ -2596,6 +2597,14 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
                               uint8_t* dst_ptr,
                               int scale,
                               int width);
+void Convert16To8Row_NEON(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width);
+void Convert16To8Row_Any_NEON(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width);
 
 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 53e746794..c662cfef6 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -147,6 +147,14 @@ void Convert16To8Plane(const uint16_t* src_y,
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
+#if defined(HAS_CONVERT16TO8ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    Convert16To8Row = Convert16To8Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_NEON;
+    }
+  }
+#endif
 #if defined(HAS_CONVERT16TO8ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     Convert16To8Row = Convert16To8Row_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 2d30e0a56..089e518af 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1481,6 +1481,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
        uint8_t,
        31)
 #endif
+#ifdef HAS_CONVERT16TO8ROW_NEON
+ANY11C(Convert16To8Row_Any_NEON,
+       Convert16To8Row_NEON,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       15)
+#endif
 #ifdef HAS_CONVERT8TO16ROW_SSE2
 ANY11C(Convert8To16Row_Any_SSE2,
        Convert8To16Row_SSE2,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index cda171ada..8ba71d07e 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3599,7 +3599,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
                          int scale,
                          int width) {
   asm volatile(
-      "vdup.16     q2, %2                        \n"
+      "vdup.16     q2, %3                        \n"
       "1:                                        \n"
       "vld1.16     {q0}, [%0]!                   \n"
       "vld1.16     {q1}, [%0]!                   \n"
@@ -3607,13 +3607,12 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
       "vmul.u16    q1, q1, q2                    \n"
       "vst1.16     {q0}, [%1]!                   \n"
       "vst1.16     {q1}, [%1]!                   \n"
-      "subs        %3, %3, #16                   \n"  // 16 src pixels per loop
+      "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
       "bgt         1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
-        "+r"(scale),  // %2
-        "+r"(width)   // %3
-      :
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
       : "cc", "memory", "q0", "q1", "q2");
 }
 
@@ -3622,7 +3621,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "vdup.16     q0, %2                        \n"
+      "vdup.16     q0, %3                        \n"
       "1:                                        \n"
       "vld1.16     {q1}, [%0]!                   \n"
       "vld1.16     {q2}, [%0]!                   \n"
@@ -3640,6 +3639,34 @@ void DivideRow_16_NEON(const uint16_t* src_y,
       "vmovn.u32   d5, q2                        \n"
       "vst1.16     {q1}, [%1]!                   \n"
       "vst1.16     {q2}, [%1]!                   \n"
+      "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
+      "bgt         1b                            \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_NEON(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width) {
+  asm volatile(
+      "vdup.16     q2, %2                        \n"
+      "1:                                        \n"
+      "vld1.16     {q0}, [%0]!                   \n"
+      "vld1.16     {q1}, [%0]!                   \n"
+      "vqdmulh.s16 q0, q0, q2                    \n"
+      "vqdmulh.s16 q1, q1, q2                    \n"
+      "vqshrn.u16  d0, q0, #1                    \n"
+      "vqshrn.u16  d1, q1, #1                    \n"
+      "vst1.16     {q0}, [%1]!                   \n"
       "subs        %3, %3, #16                   \n"  // 16 src pixels per loop
       "bgt         1b                            \n"
       : "+r"(src_y),  // %0
@@ -3647,7 +3674,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
         "+r"(scale),  // %2
         "+r"(width)   // %3
       :
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+      : "cc", "memory", "q0", "q1", "q2");
 }
 
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 27723810d..8d43d5940 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4070,21 +4070,19 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
                          int scale,
                          int width) {
   asm volatile(
-      "dup         v2.8h, %w2                    \n"
+      "dup         v2.8h, %w3                    \n"
       "1:                                        \n"
       "ldp         q0, q1, [%0], #32             \n"
       "mul         v0.8h, v0.8h, v2.8h           \n"
       "prfm        pldl1keep, [%0, 448]          \n"
       "mul         v1.8h, v1.8h, v2.8h           \n"
-      "stp         q0, q1, [%1]                  \n"  // store 16 pixels
-      "add         %1, %1, #32                   \n"
-      "subs        %w3, %w3, #16                 \n"  // 16 src pixels per loop
+      "stp         q0, q1, [%1], #32             \n"  // store 16 pixels
+      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
       "b.gt        1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
-        "+r"(scale),  // %2
-        "+r"(width)   // %3
-      :
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
       : "cc", "memory", "v0", "v1", "v2");
 }
 
@@ -4093,7 +4091,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "dup         v0.8h, %w2                    \n"
+      "dup         v0.8h, %w3                    \n"
       "1:                                        \n"
       "ldp         q1, q2, [%0], #32             \n"
       "ushll       v3.4s, v1.4h, #0              \n"
@@ -4109,18 +4107,44 @@ void DivideRow_16_NEON(const uint16_t* src_y,
       "shrn        v4.4h, v4.4s, #16             \n"
       "shrn2       v3.8h, v1.4s, #16             \n"
       "shrn2       v4.8h, v2.4s, #16             \n"
-      "stp         q3, q3, [%1]                  \n"  // store 16 pixels
-      "add         %1, %1, #32                   \n"
-      "subs        %w3, %w3, #16                 \n"  // 16 src pixels per loop
+      "stp         q3, q3, [%1], #32             \n"  // store 16 pixels
+      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
       "b.gt        1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
-        "+r"(scale),  // %2
-        "+r"(width)   // %3
-      :
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
 }
 
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_NEON(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width) {
+  asm volatile(
+      "dup         v2.8h, %w3                    \n"
+      "1:                                        \n"
+      "ldp         q0, q1, [%0], #32             \n"
+      "sqdmulh     v0.8h, v0.8h, v2.8h           \n"
+      "sqdmulh     v1.8h, v1.8h, v2.8h           \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
+      "uqshrn      v0.8b, v0.8h, #1              \n"
+      "uqshrn2     v0.16b, v1.8h, #1             \n"
+      "str         q0, [%1], #16                 \n"  // store 16 pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "cc", "memory", "v0", "v1", "v2");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus