diff --git a/README.chromium b/README.chromium
index 33ba36edb..bffa74dec 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1706
+Version: 1707
 License: BSD
 License File: LICENSE
 
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 09297b66a..2b986ab19 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -103,6 +103,10 @@ ios simulator
     ninja -v -C out/Debug libyuv_unittest
     ninja -v -C out/Release libyuv_unittest
 
+ios disassembly
+
+    otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+
 ### Android
 https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
 
@@ -144,6 +148,8 @@ arm disassembly:
 
     third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
 
+Caveat: disassembly may require that `optimize_max` be disabled in BUILD.gn.
+
 Running tests:
 
     build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5fad06b7c..ecc9f6a7a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -55,9 +55,9 @@ extern "C" {
 #endif  // clang >= 3.4
 #endif  // __clang__
 
-// clang >= 6.0.0 required for AVX512.
+// clang >= 7.0.0 required for AVX512.
 #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
-#if (__clang_major__ >= 6)
+#if (__clang_major__ >= 7)
 #define CLANG_HAS_AVX512 1
 #endif  // clang >= 6
 #endif  // __clang__
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index cf8fc3f6d..673976e47 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1706
+#define LIBYUV_VERSION 1707
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 8b6c19520..ff87e74c6 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2604,12 +2604,15 @@ void SobelYRow_NEON(const uint8_t* src_y0,
       );
 }
 
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// The register must be d0 to d15 and indexed with [0] or [1] to access
+// the first or second float of the d-reg.
+
 void HalfFloat1Row_NEON(const uint16_t* src,
                         uint16_t* dst,
                         float /*unused*/,
                         int width) {
   asm volatile(
-      "vdup.32    q0, %3                         \n"
 
       "1:                                        \n"
       "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
@@ -2618,8 +2621,8 @@ void HalfFloat1Row_NEON(const uint16_t* src,
       "vmovl.u16  q3, d3                         \n"
       "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
       "vcvt.f32.u32  q3, q3                      \n"
-      "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
-      "vmul.f32   q3, q3, q0                     \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
       "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
       "vqshrn.u32 d3, q3, #13                    \n"
       "vst1.8     {q1}, [%1]!                    \n"
@@ -2627,17 +2630,15 @@ void HalfFloat1Row_NEON(const uint16_t* src,
       : "+r"(src),              // %0
         "+r"(dst),              // %1
         "+r"(width)             // %2
-      : "r"(1.9259299444e-34f)  // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3");
+      : "w"(1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
 }
 
-// TODO(fbarchard): multiply by element.
 void HalfFloatRow_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float scale,
                        int width) {
   asm volatile(
-      "vdup.32    q0, %3                         \n"
 
       "1:                                        \n"
       "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
@@ -2646,8 +2647,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
       "vmovl.u16  q3, d3                         \n"
       "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
       "vcvt.f32.u32  q3, q3                      \n"
-      "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
-      "vmul.f32   q3, q3, q0                     \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
       "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
       "vqshrn.u32 d3, q3, #13                    \n"
       "vst1.8     {q1}, [%1]!                    \n"
@@ -2655,8 +2656,8 @@ void HalfFloatRow_NEON(const uint16_t* src,
       : "+r"(src),                      // %0
         "+r"(dst),                      // %1
         "+r"(width)                     // %2
-      : "r"(scale * 1.9259299444e-34f)  // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3");
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
 }
 
 void ByteToFloatRow_NEON(const uint8_t* src,
@@ -2664,7 +2665,6 @@ void ByteToFloatRow_NEON(const uint8_t* src,
                          float scale,
                          int width) {
   asm volatile(
-      "vdup.32    q0, %3                         \n"
 
       "1:                                        \n"
       "vld1.8     {d2}, [%0]!                    \n"  // load 8 bytes
@@ -2674,15 +2674,15 @@ void ByteToFloatRow_NEON(const uint8_t* src,
       "vmovl.u16  q3, d3                         \n"
       "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
       "vcvt.f32.u32  q3, q3                      \n"
-      "vmul.f32   q2, q2, d0[0]                  \n"  // scale
-      "vmul.f32   q3, q3, d0[0]                  \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // scale
+      "vmul.f32   q3, q3, %y3                    \n"
       "vst1.8     {q2, q3}, [%1]!                \n"  // store 8 floats
       "bgt        1b                             \n"
       : "+r"(src),   // %0
         "+r"(dst),   // %1
         "+r"(width)  // %2
-      : "r"(scale)   // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3");
+      : "w"(scale)   // %3
+      : "cc", "memory", "q1", "q2", "q3");
 }
 
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..