From 83ca1abe09207daae1628fd8f0d4a0debaef96c6 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Wed, 9 Aug 2017 14:25:38 -0700
Subject: [PATCH] Change ScaleSumSamples to return Sum of Squares

TBR=kjellander@chromium.org
BUG=libyuv:717
TEST=LibYUVPlanarTest.TestScaleSumSamples_Opt

Change-Id: I5208666f3968c5c4b0f1b0c951f24216d78ee3fe
Reviewed-on: https://chromium-review.googlesource.com/607184
Reviewed-by: Cheng Wang <wangcheng@google.com>
---
 BUILD.gn                 |  7 +++---
 README.chromium          |  2 +-
 include/libyuv/version.h |  2 +-
 source/row_common.cc     | 12 +++++-----
 source/row_neon64.cc     | 29 +++++++++++++-----------
 unit_test/planar_test.cc | 48 ++++++++++++++++++----------------------
 6 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/BUILD.gn b/BUILD.gn
index 7f5f26f6a..3eefc3616 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -158,10 +158,11 @@ static_library("libyuv_internal") {
   }
 
   # To enable AVX2 or other cpu optimization, pass flag here
-  # cflags = [ "-mavx2" ]
-  # cflags = [ "-mpopcnt" ]
+  # cflags = [ "-mavx2", "-mpopcnt", "-mfma" ]
+  if (!is_win) {
+    cflags = [ "-ffp-contract=fast" ]   # Enable fma vectorization for NEON.
+  }
 }
-
 if (libyuv_use_neon) {
   static_library("libyuv_neon") {
     sources = [
diff --git a/README.chromium b/README.chromium
index 13a188fe8..88c7c8660 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1664
+Version: 1665
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 97ced6a7a..4a1d3d1c0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1664
+#define LIBYUV_VERSION 1665
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_common.cc b/source/row_common.cc
index b02aa2b5d..f490a8e3d 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2642,10 +2642,13 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
   float fmax = 0.f;
   int i;
+#if defined(__clang__)
+  #pragma clang loop vectorize_width(4)
+#endif  
   for (i = 0; i < width; ++i) {
-    float v = *src++ * scale;
-    *dst++ = v;
-    fmax = (v > fmax) ? v : fmax;
+    float v = *src++;
+    fmax += v * v;
+    *dst++ = v * scale;
   }
   return fmax;
 }
@@ -2653,8 +2656,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
 void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    float v = *src++ * scale;
-    *dst++ = v;
+    *dst++ = *src++ * scale;
   }
 }
 
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 456c6ea5a..622ff5fbc 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2616,30 +2616,33 @@ float ScaleSumSamples_NEON(const float* src,
                            float* dst,
                            float scale,
                            int width) {
-  float fmax;
+  float fsum;
   asm volatile(
-      "movi       v3.4s, #0                      \n"  // max
-      "movi       v4.4s, #0                      \n"  // max
+      "movi       v5.4s, #0                      \n"  // sum of squares
+      "movi       v6.4s, #0                      \n"  // sum of squares
 
       "1:                                        \n"
       "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
       "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-      "fmul       v1.4s, v1.4s, %4.s[0]          \n"  // scale
-      "fmul       v2.4s, v2.4s, %4.s[0]          \n"  // scale
-      "st1        {v1.4s, v2.4s}, [%1], #32      \n"  // store 8 samples
-      "fmax       v3.4s, v3.4s, v1.4s            \n"  // max
-      "fmax       v4.4s, v4.4s, v2.4s            \n"
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"
+      "fmla       v5.4s, v1.4s, v1.4s            \n"  // sum of squares
+      "fmla       v6.4s, v2.4s, v2.4s            \n"
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+
       "b.gt       1b                             \n"
-      "fmax       v3.4s, v3.4s, v4.4s            \n"  // max
-      "fmaxv      %s3, v3.4s                     \n"  // signed max acculator
+      "faddp      v5.4s, v5.4s, v6.4s            \n"
+      "faddp      v5.4s, v5.4s, v5.4s            \n"
+      "faddp      v5.4s, v5.4s, v5.4s            \n"
+      "fmov       %w3, s5                        \n"  // sum
 
       : "+r"(src),    // %0
         "+r"(dst),    // %1
         "+r"(width),  // %2
-        "=w"(fmax)    // %3
+        "=w"(fsum)    // %3
       : "w"(scale)    // %4
-      : "cc", "memory", "v1", "v2", "v3", "v4");
-  return fmax;
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fsum;
 }
 
 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index a0d7881ed..2adc6e79c 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2527,7 +2527,7 @@ float TestScaleSumSamples(int benchmark_width,
                           float scale,
                           bool opt) {
   int i, j;
-  float max_c, max_opt;
+  float sum_c, sum_opt = 0.f;
   const int y_plane_size = benchmark_width * benchmark_height * 4;
 
   align_buffer_page_end(orig_y, y_plane_size * 3);
@@ -2542,32 +2542,29 @@ float TestScaleSumSamples(int benchmark_width,
   memset(dst_c, 0, y_plane_size);
   memset(dst_opt, 1, y_plane_size);
 
-  // Disable all optimizations.
-  max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+  sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
                             reinterpret_cast<float*>(dst_c), scale,
                             benchmark_width * benchmark_height);
 
-  // Enable optimizations.
   for (j = 0; j < benchmark_iterations; j++) {
-#ifdef HAS_SCALESUMSAMPLES_NEON
     if (opt) {
-      max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
+#ifdef HAS_SCALESUMSAMPLES_NEON
+      sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
                                      reinterpret_cast<float*>(dst_opt), scale,
                                      benchmark_width * benchmark_height);
-
+#else
+      sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+                                  reinterpret_cast<float*>(dst_opt), scale,
+                                  benchmark_width * benchmark_height);
+#endif
     } else {
-      max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+      sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
                                   reinterpret_cast<float*>(dst_opt), scale,
                                   benchmark_width * benchmark_height);
     }
-#else
-    max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
-                                reinterpret_cast<float*>(dst_opt), scale,
-                                benchmark_width * benchmark_height);
-#endif
   }
 
-  float max_diff = 0;
+  float max_diff = FAbs(sum_opt - sum_c);
   for (i = 0; i < y_plane_size / 4; ++i) {
     float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
                           (reinterpret_cast<float*>(dst_opt)[i]));
@@ -2613,32 +2610,29 @@ float TestScaleSamples(int benchmark_width,
   memset(dst_c, 0, y_plane_size);
   memset(dst_opt, 1, y_plane_size);
 
-  // Disable all optimizations.
   ScaleSamples_C(reinterpret_cast<float*>(orig_y),
                  reinterpret_cast<float*>(dst_c), scale,
                  benchmark_width * benchmark_height);
 
-  // Enable optimizations.
   for (j = 0; j < benchmark_iterations; j++) {
-#ifdef HAS_SCALESAMPLES_NEON
     if (opt) {
-      max_opt = ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
-                                  reinterpret_cast<float*>(dst_opt), scale,
-                                  benchmark_width * benchmark_height);
-
+#ifdef HAS_SCALESUMSAMPLES_NEON
+      ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
+                        reinterpret_cast<float*>(dst_opt), scale,
+                        benchmark_width * benchmark_height);
+#else
+      ScaleSamples_C(reinterpret_cast<float*>(orig_y),
+                     reinterpret_cast<float*>(dst_opt), scale,
+                     benchmark_width * benchmark_height);
+#endif
     } else {
       ScaleSamples_C(reinterpret_cast<float*>(orig_y),
                      reinterpret_cast<float*>(dst_opt), scale,
                      benchmark_width * benchmark_height);
     }
-#else
-    ScaleSamples_C(reinterpret_cast<float*>(orig_y),
-                   reinterpret_cast<float*>(dst_opt), scale,
-                   benchmark_width * benchmark_height);
-#endif
   }
 
-  float max_diff = 0;
+  float max_diff =0.f;
   for (i = 0; i < y_plane_size / 4; ++i) {
     float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
                           (reinterpret_cast<float*>(dst_opt)[i]));