diff --git a/README.chromium b/README.chromium
index ca11605ae..181e653ea 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1600
+Version: 1601
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 6434a8dbd..896d1d9b7 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1600
+#define LIBYUV_VERSION 1601
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale_common.cc b/source/scale_common.cc
index baed70b9d..3507aa4d9 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -417,11 +417,9 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 
 // (1-f)a + fb can be replaced with a + f(b-a)
-#if defined(__arm__)
-// arm uses 16 bit math with truncation.
-// TODO(fbarchard): add rounding.
+#if defined(__arm__) || defined(__aarch64__)  // 16 bit fraction, rounded.
 #define BLENDER(a, b, f) (uint8)((int)(a) + \
-    (((int)((f)) * ((int)(b) - (int)(a))) >> 16))
+    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 #else
 // inteluses 7 bit math with rounding.
 #define BLENDER(a, b, f) (uint8)((int)(a) + \
@@ -480,7 +478,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
 
 // Same as 8 bit arm blender but return is cast to uint16
 #define BLENDER(a, b, f) (uint16)((int)(a) + \
-    (((int)((f)) * ((int)(b) - (int)(a))) >> 16))
+    (int)(((int64)(f) * ((int64)(b) - (int64)(a)) + 0x8000) >> 16))
 
 void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
                        int dst_width, int x, int dx) {
@@ -818,7 +816,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
   }
 }
 
-// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=605.
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
 #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
 #define BLENDERC(a, b, f, s) (uint32)( \
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 26bb70592..44b0c8080 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -612,8 +612,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
     "vmovl.u16  q10, d21                       \n"
     "vmul.s32   q11, q11, q13                  \n"
     "vmul.s32   q12, q12, q10                  \n"
-    "vshrn.s32  d18, q11, #16                  \n"
-    "vshrn.s32  d19, q12, #16                  \n"
+    "vrshrn.s32  d18, q11, #16                 \n"
+    "vrshrn.s32  d19, q12, #16                 \n"
     "vadd.s16   q8, q8, q9                     \n"
     "vmovn.s16  d6, q8                         \n"
 
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 3a62db5b8..ff277f26f 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -626,8 +626,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
     "ushll2    v6.4s, v6.8h, #0                \n"
     "mul       v16.4s, v16.4s, v7.4s           \n"
     "mul       v17.4s, v17.4s, v6.4s           \n"
-    "shrn      v6.4h, v16.4s, #16              \n"
-    "shrn2     v6.8h, v17.4s, #16              \n"
+    "rshrn      v6.4h, v16.4s, #16             \n"
+    "rshrn2     v6.8h, v17.4s, #16             \n"
     "add       v4.8h, v4.8h, v6.8h             \n"
     "xtn       v4.8b, v4.8h                    \n"