diff --git a/README.chromium b/README.chromium
index c56327178..c74afb92d 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 217
+Version: 218
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e99ad6491..f5cb202de 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 217
+#define LIBYUV_VERSION 218
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/compare.cc b/source/compare.cc
index c57a59162..1da3dc4ab 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -39,8 +39,8 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 #define HAS_SUMSQUAREERROR_NEON
 
-static uint32 SumSquareError_NEON(const uint8* src_a,
-                                  const uint8* src_b, int count) {
+static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
+                                  int count) {
   volatile uint32 sse;
   asm volatile (
     "vmov.u8    q7, #0                         \n"
@@ -79,8 +79,8 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
 #elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
 #define HAS_SUMSQUAREERROR_SSE2
 __declspec(naked)
-static uint32 SumSquareError_SSE2(const uint8* src_a,
-                                  const uint8* src_b, int count) {
+static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
+                                  int count) {
   __asm {
     mov        eax, [esp + 4]    // src_a
     mov        edx, [esp + 8]    // src_b
@@ -119,8 +119,8 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
 
 #elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
 #define HAS_SUMSQUAREERROR_SSE2
-static uint32 SumSquareError_SSE2(const uint8* src_a,
-                                  const uint8* src_b, int count) {
+static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
+                                  int count) {
   uint32 sse;
   asm volatile (
     "pxor      %%xmm0,%%xmm0                   \n"
@@ -165,8 +165,8 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
 }
 #endif
 
-static uint32 SumSquareError_C(const uint8* src_a,
-                               const uint8* src_b, int count) {
+static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
+                               int count) {
   uint32 sse = 0u;
   for (int x = 0; x < count; ++x) {
     int diff = src_a[0] - src_b[0];
@@ -177,23 +177,20 @@ static uint32 SumSquareError_C(const uint8* src_a,
   return sse;
 }
 
-uint64 ComputeSumSquareError(const uint8* src_a,
-                             const uint8* src_b, int count) {
-  uint32 (*SumSquareError)(const uint8* src_a,
-                           const uint8* src_b, int count);
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+                             int count) {
+  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+      SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
-  } else
+  }
 #elif defined(HAS_SUMSQUAREERROR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
     SumSquareError = SumSquareError_SSE2;
-  } else
-#endif
-  {
-    SumSquareError = SumSquareError_C;
   }
+#endif
   // 32K values will fit a 32bit int return value from SumSquareError.
   // After each block of 32K, accumulate into 64 bit int.
   const int kBlockSize = 1 << 15;  // 32768;
@@ -222,17 +219,18 @@ uint64 ComputeSumSquareError(const uint8* src_a,
 uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
                                   const uint8* src_b, int stride_b,
                                   int width, int height) {
-  uint32 (*SumSquareError)(const uint8* src_a,
-                           const uint8* src_b, int count);
+  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+      SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(width, 16)) {
+  if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
-  } else
-#endif
-  {
-    SumSquareError = SumSquareError_C;
   }
+#elif defined(HAS_SUMSQUAREERROR_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
 
   uint64 sse = 0;
   for (int h = 0; h < height; ++h) {
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 4430b0e06..52730e249 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -152,6 +152,9 @@ TEST_F(libyuvTest, BenchmarkPsnr_C) {
                   src_b, _benchmark_width,
                   _benchmark_width, _benchmark_height);
 
+  c_time = (get_time() - c_time) / _benchmark_iterations;
+  printf ("BenchmarkPsnr_C - %8d us c\n", (int)(c_time*1e6));
+
   MaskCpuFlags(-1);
 
   EXPECT_EQ(0, 0);
@@ -164,15 +167,16 @@ TEST_F(libyuvTest, BenchmarkPsnr_OPT) {
   align_buffer_16(src_a, _benchmark_width * _benchmark_height)
   align_buffer_16(src_b, _benchmark_width * _benchmark_height)
 
-  MaskCpuFlags(kCpuInitialized);
+  MaskCpuFlags(-1);
 
-  double c_time = get_time();
+  double opt_time = get_time();
   for (int i = 0; i < _benchmark_iterations; ++i)
     CalcFramePsnr(src_a, _benchmark_width,
                   src_b, _benchmark_width,
                   _benchmark_width, _benchmark_height);
 
-  MaskCpuFlags(-1);
+  opt_time = (get_time() - opt_time) / _benchmark_iterations;
+  printf ("BenchmarkPsnr_OPT - %8d us opt\n", (int)(opt_time*1e6));
 
   EXPECT_EQ(0, 0);
 
@@ -269,6 +273,9 @@ TEST_F(libyuvTest, BenchmarkSsim_C) {
                   src_b, _benchmark_width,
                   _benchmark_width, _benchmark_height);
 
+  c_time = (get_time() - c_time) / _benchmark_iterations;
+  printf ("BenchmarkSsim_C - %8d us c\n", (int)(c_time*1e6));
+
   MaskCpuFlags(-1);
 
   EXPECT_EQ(0, 0);
@@ -281,15 +288,16 @@ TEST_F(libyuvTest, BenchmarkSsim_OPT) {
   align_buffer_16(src_a, _benchmark_width * _benchmark_height)
   align_buffer_16(src_b, _benchmark_width * _benchmark_height)
 
-  MaskCpuFlags(kCpuInitialized);
+  MaskCpuFlags(-1);
 
-  double c_time = get_time();
+  double opt_time = get_time();
   for (int i = 0; i < _benchmark_iterations; ++i)
     CalcFrameSsim(src_a, _benchmark_width,
                   src_b, _benchmark_width,
                   _benchmark_width, _benchmark_height);
 
-  MaskCpuFlags(-1);
+  opt_time = (get_time() - opt_time) / _benchmark_iterations;  // avg per call
+  printf ("BenchmarkSsim_OPT - %8d us opt\n", (int)(opt_time*1e6));
 
   EXPECT_EQ(0, 0);
 
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 46ab385e6..2e0135e98 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -95,7 +95,6 @@ TEST_F (libyuvTest, I420To##FMT##_CvsOPT) {                                    \
               dst_rgb_opt, src_width << 2,                                     \
               src_width, src_height);                                          \
   int err = 0;                                                                 \
-  int i = 0;                                                                   \
   for (int i = 0; i < src_height; ++i) {                                       \
     for (int j = 0; j < src_width << 2; ++j) {                                 \
       int diff = (int)(dst_rgb_c[i * src_height + j]) -                        \