diff --git a/BUILD.gn b/BUILD.gn
index 01b023ee6..04bf80fff 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -122,6 +122,10 @@ static_library("libyuv") {
     # Enable optimize for speed (-O2) over size (-Os).
     configs += [ "//build/config/compiler:optimize_max" ]
   }
+
+  # To enable AVX2 or other CPU optimizations, pass the compiler flag here:
+  #  cflags = [ "-mavx2" ]
+
 }
 
 if (libyuv_use_neon) {
@@ -140,6 +144,14 @@ if (libyuv_use_neon) {
 
     public_configs = [ ":libyuv_config" ]
 
+    # Always enable optimization for Release and NaCl builds (to work around
+    # crbug.com/538243).
+    if (!is_debug) {
+      configs -= [ "//build/config/compiler:default_optimization" ]
+      # Enable optimize for speed (-O2) over size (-Os).
+      configs += [ "//build/config/compiler:optimize_max" ]
+    }
+
     if (current_cpu != "arm64") {
       configs -= [ "//build/config/compiler:compiler_arm_fpu" ]
       cflags = [ "-mfpu=neon" ]
diff --git a/README.chromium b/README.chromium
index 22c4937e3..159826180 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1627
+Version: 1628
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ef3efa5f3..96861befb 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -201,7 +201,7 @@ extern "C" {
 #define HAS_COPYROW_AVX
 #define HAS_H422TOARGBROW_AVX2
 #define HAS_HALFFLOATROW_AVX2
-// #define HAS_HALFFLOATROW_F16C  // Enable to test halffloat cast
+//  #define HAS_HALFFLOATROW_F16C  // Enable to test halffloat cast
 #define HAS_I400TOARGBROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
@@ -330,6 +330,11 @@ extern "C" {
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
 
+// TODO(fbarchard): Port to 32 bit.
+#if defined(__aarch64__)
+#define HAS_HALFFLOATROW_NEON
+#endif
+
 // Effects:
 #define HAS_ARGBADDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
@@ -1954,6 +1959,9 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
 void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
                            int width);
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale,
+                           int width);
 
 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                              const uint8* luma, uint32 lumacoeff);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 42835951f..cebc731db 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1627
+#define LIBYUV_VERSION 1628
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index da4c47b83..143ae869d 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2585,6 +2585,15 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_HALFFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HalfFloatRow = HalfFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = HalfFloatRow_NEON;
+    }
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     HalfFloatRow(src_y, dst_y, scale, width);
     src_y += src_stride_y;
diff --git a/source/row_any.cc b/source/row_any.cc
index dbed89189..ec0aa21d7 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -585,6 +585,9 @@ ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
 #ifdef HAS_HALFFLOATROW_F16C
 ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
 #endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
+#endif
 #undef ANY11P16
 
 // Any 1 to 1 with yuvconstants
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index d62762dab..3d122680e 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2710,6 +2710,32 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
   );
 }
+
+void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (  // uint16 -> half float, 8 values per loop iteration
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+    "uxtl       v2.4s, v1.4h                   \n"  // low 4 shorts to int
+    "uxtl2      v1.4s, v1.8h                   \n"  // high 4 shorts to int
+    "scvtf      v2.4s, v2.4s                   \n"  // 8 ints to floats
+    "scvtf      v1.4s, v1.4s                   \n"
+    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale, adjust exponent
+    "fmul       v1.4s, v1.4s, %3.s[0]          \n"
+    "uqshrn     v4.4h, v2.4s, #13              \n"  // isolate halffloat
+    "uqshrn2    v4.8h, v1.4s, #13              \n"
+   MEMACCESS(1)
+    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts
+    "b.gt       1b                             \n"  // loop while width > 0
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "w"(scale * 1.9259299444e-34f)    // %3  (constant is 2^-112)
+  : "cc", "memory", "v1", "v2", "v4"
+  );
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 722074f73..c552c4a59 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2081,9 +2081,12 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
   }
 }
 
-TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
+int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
+                       int benchmark_iterations,
+                       int disable_cpu_flags, int benchmark_cpu_info,
+                       float scale) {
   int i, j;
-  const int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
+  const int y_plane_size = benchmark_width * benchmark_height * 2;
 
   align_buffer_page_end(orig_y, y_plane_size);
   align_buffer_page_end(dst_c, y_plane_size);
@@ -2093,32 +2096,62 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
   memset(dst_opt, 1, y_plane_size);
 
   // Disable all optimizations.
-  MaskCpuFlags(disable_cpu_flags_);
+  MaskCpuFlags(disable_cpu_flags);
   double c_time = get_time();
-  for (j = 0; j < benchmark_iterations_; j++) {
-    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
-                   (uint16*)dst_c, benchmark_width_ * 2,
-                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+  for (j = 0; j < benchmark_iterations; j++) {
+    HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
+                   (uint16*)dst_c, benchmark_width * 2,
+                   scale, benchmark_width, benchmark_height);
   }
-  c_time = (get_time() - c_time) / benchmark_iterations_;
+  c_time = (get_time() - c_time) / benchmark_iterations;
 
   // Enable optimizations.
-  MaskCpuFlags(benchmark_cpu_info_);
+  MaskCpuFlags(benchmark_cpu_info);
   double opt_time = get_time();
-  for (j = 0; j < benchmark_iterations_; j++) {
-    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
-                   (uint16*)dst_opt, benchmark_width_ * 2,
-                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+  for (j = 0; j < benchmark_iterations; j++) {
+    HalfFloatPlane((uint16*)orig_y, benchmark_width * 2,
+                   (uint16*)dst_opt, benchmark_width * 2,
+                   scale, benchmark_width, benchmark_height);
   }
-  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  opt_time = (get_time() - opt_time) / benchmark_iterations;
 
+  int diff = 0;
   for (i = 0; i < y_plane_size; ++i) {
-    EXPECT_EQ(dst_c[i], dst_opt[i]);
+    diff = dst_c[i] - dst_opt[i];
+    if (diff) break;
   }
 
   free_aligned_buffer_page_end(orig_y);
   free_aligned_buffer_page_end(dst_c);
   free_aligned_buffer_page_end(dst_opt);
+  return diff;
+}
+
+// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
+// exponent to be less than 0.  15 - log2(65536) = -1.  This shouldn't normally
+// happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_denormal) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 65536.0f);  // 1 / (1 << 16)
+  EXPECT_EQ(diff, 0);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 4096.0f);  // 1 / (1 << 12)
+  EXPECT_EQ(diff, 0);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 1023.0f);  // not a power of 2
+  EXPECT_EQ(diff, 0);
+}
 
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {