diff --git a/README.chromium b/README.chromium
index 4e8ed86f1..8e05a62b3 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1055
+Version: 1056
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index d5663fe8e..4b3c870f9 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -271,14 +271,14 @@ extern "C" {
 // #define HAS_NV21TORGB565ROW_NEON
 // #define HAS_YUY2TOARGBROW_NEON
 // #define HAS_UYVYTOARGBROW_NEON
-// #define HAS_SPLITUVROW_NEON
-// #define HAS_MERGEUVROW_NEON
-// #define HAS_COPYROW_NEON
-// #define HAS_SETROW_NEON
-// #define HAS_ARGBSETROWS_NEON
-// #define HAS_MIRRORROW_NEON
-// #define HAS_MIRRORUVROW_NEON
-// #define HAS_ARGBMIRRORROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROWS_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 // #define HAS_RGB565TOARGBROW_NEON
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 720779262..dfe9d6426 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1055
+#define LIBYUV_VERSION 1056
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 6e964433e..21111cf60 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -824,19 +824,19 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "ld2        {v0.16b, v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    "st1        {v0.16b}, [%1], #16            \n"  // store U
     MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
+    "st1        {v1.16b}, [%2], #16            \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
       "+r"(dst_v),   // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_SPLITUVROW_NEON
@@ -849,12 +849,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    "ld1        {v0.16b}, [%0], #16            \n"  // load U
     MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "ld1        {v1.16b}, [%1], #16            \n"  // load V
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "st2        {v0.16b, v1.16b}, [%2], #32    \n"  // store 16 pairs of UV
     "bgt        1b                             \n"
     :
       "+r"(src_u),   // %0
@@ -862,7 +862,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3  // Output registers
     :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
+    : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
 #endif  // HAS_MERGEUVROW_NEON
@@ -874,16 +874,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "ld1        {v0.8b-v3.8b}, [%0], #32       \n"  // load 32
     "subs       %2, %2, #32                    \n"  // 32 processed per loop
     MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+    "st1        {v0.8b-v3.8b}, [%1], #32       \n"  // store 32
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(count)  // %2  // Output registers
   :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
 #endif  // HAS_COPYROW_NEON
@@ -892,16 +892,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
 #ifdef HAS_SETROW_NEON
 void SetRow_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
     "1:                                        \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
     MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "st1        {v0.16b}, [%0], #16            \n"  // store
     "bgt       1b                              \n"
   : "+r"(dst),   // %0
     "+r"(count)  // %1
   : "r"(v32)     // %2
-  : "cc", "memory", "q0"
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_SETROW_NEON
@@ -922,26 +922,25 @@ void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #16                    \n"  // 16 pixels per loop.
+    "rev64      v0.16b, v0.16b                 \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_MIRRORROW_NEON
@@ -951,27 +950,27 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                       int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r12, #-16                      \n"
     "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
+    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+    "subs       %3, %3, #8                     \n"  // 8 pixels per loop.
+    "rev64      v0.8b, v0.8b                   \n"
+    "rev64      v1.8b, v1.8b                   \n"
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    "st1        {v0.8b}, [%1], #8               \n"  // dst += 8
     MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
+    "st1        {v1.8b}, [%2], #8               \n"
     "bgt        1b                             \n"
   : "+r"(src_uv),  // %0
     "+r"(dst_u),   // %1
     "+r"(dst_v),   // %2
     "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
+  : "r"((ptrdiff_t)-16)      // %4
+  : "cc", "memory", "v0", "v1"
   );
 }
 #endif  // HAS_MIRRORUVROW_NEON
@@ -980,26 +979,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // Start at end of source row.
-    "mov        r3, #-16                       \n"
     "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
+    "sub        %0, %0, #16                    \n"
 
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
+    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    "rev64      v0.4s, v0.4s                   \n"
     MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
     MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
+    "st1        {v0.D}[0], [%1], #8            \n"
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
+  : "r"((ptrdiff_t)-16)    // %3
+  : "cc", "memory", "v0"
   );
 }
 #endif  // HAS_ARGBMIRRORROW_NEON