From 56bbcdf42207008d63f0ae4b9b3b014ed0741d08 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Mon, 14 Aug 2017 16:32:58 -0700
Subject: [PATCH] Reintroduce the max version of scale

Add ScaleMaxSamples_NEON, which scales the samples and returns the
max of the original (unscaled) values.

TBR=kjellander@chromium.org
BUG=libyuv:717
TEST=LibYUVPlanarTest.TestScaleMaxSamples_Opt

Change-Id: Id99338860782b10ffd24f66242eb42014c2e229e
Reviewed-on: https://chromium-review.googlesource.com/614685
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
---
 README.chromium          |  2 +-
 include/libyuv/row.h     |  5 +++
 include/libyuv/version.h |  2 +-
 source/row_common.cc     | 18 +++++++++--
 source/row_neon64.cc     | 30 +++++++++++++++++
 unit_test/planar_test.cc | 70 +++++++++++++++++++++++++++++++++++++++-
 6 files changed, 121 insertions(+), 6 deletions(-)

diff --git a/README.chromium b/README.chromium
index 88c7c8660..757e86d0b 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1665
+Version: 1666
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 4fc483f96..164433e6b 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -3178,6 +3178,11 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                  const uint8* luma,
                                  uint32 lumacoeff);
 
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
 float ScaleSumSamples_NEON(const float* src,
                            float* dst,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 4a1d3d1c0..b9f3d6522 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1665
+#define LIBYUV_VERSION 1666
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_common.cc b/source/row_common.cc
index f490a8e3d..c9f71b851 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2640,16 +2640,28 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
 #endif
 
 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
-  float fmax = 0.f;
+  float fsum = 0.f;
   int i;
 #if defined(__clang__)
   #pragma clang loop vectorize_width(4)
-#endif  
+#endif
   for (i = 0; i < width; ++i) {
     float v = *src++;
-    fmax += v * v;
+    fsum += v * v;
     *dst++ = v * scale;
   }
+  return fsum;
+}
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
+  float fmax = 0.f;
+  int i;
+  for (i = 0; i < width; ++i) {
+    float v = *src++;
+    float vs = v * scale;
+    fmax = (v > fmax) ? v : fmax;
+    *dst++ = vs;
+  }
   return fmax;
 }
 
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 622ff5fbc..53248c64b 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2612,6 +2612,36 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
       : "cc", "memory", "v1", "v2", "v3");
 }
 
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fmax;
+  asm volatile(
+      "movi       v5.4s, #0                      \n"  // zero max accumulator A
+      "movi       v6.4s, #0                      \n"  // zero max accumulator B
+
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"  // scale
+      "fmax       v5.4s, v5.4s, v1.4s            \n"  // max of unscaled samples
+      "fmax       v6.4s, v6.4s, v2.4s            \n"  // max of unscaled samples
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"  // loop while width > 0
+      "fmax       v5.4s, v5.4s, v6.4s            \n"  // merge the 2 accumulators
+      "fmaxv      %s3, v5.4s                     \n"  // max across the 4 lanes
+
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width),  // %2
+        "=w"(fmax)    // %3
+      : "w"(scale)    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fmax;
+}
+
 float ScaleSumSamples_NEON(const float* src,
                            float* dst,
                            float scale,
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 2adc6e79c..dbae3658b 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2521,6 +2521,74 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+float TestScaleMaxSamples(int benchmark_width,
+                          int benchmark_height,
+                          int benchmark_iterations,
+                          float scale,
+                          bool opt) {
+  int i, j;
+  float max_c, max_opt = 0.f;
+  const int y_plane_size = benchmark_width * benchmark_height * 4;
+
+  align_buffer_page_end(orig_y, y_plane_size * 3);
+  uint8* dst_opt = orig_y + y_plane_size;
+  uint8* dst_c = orig_y + y_plane_size * 2;
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, y_plane_size);
+  for (i = 0; i < y_plane_size / 4; ++i) {
+    (reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
+  }
+  memset(dst_c, 0, y_plane_size);
+  memset(dst_opt, 1, y_plane_size);
+
+  max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
+                            reinterpret_cast<float*>(dst_c), scale,
+                            benchmark_width * benchmark_height);
+
+  for (j = 0; j < benchmark_iterations; j++) {
+    if (opt) {
+#ifdef HAS_SCALESUMSAMPLES_NEON
+      max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y),
+                                     reinterpret_cast<float*>(dst_opt), scale,
+                                     benchmark_width * benchmark_height);
+#else
+      max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
+                                  reinterpret_cast<float*>(dst_opt), scale,
+                                  benchmark_width * benchmark_height);
+#endif
+    } else {
+      max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
+                                  reinterpret_cast<float*>(dst_opt), scale,
+                                  benchmark_width * benchmark_height);
+    }
+  }
+
+  float max_diff = FAbs(max_opt - max_c);
+  for (i = 0; i < y_plane_size / 4; ++i) {
+    float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
+                          (reinterpret_cast<float*>(dst_opt)[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_C) {
+  float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_,
+                                   benchmark_iterations_, 1.2f, false);
+  EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_Opt) {
+  float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_,
+                                   benchmark_iterations_, 1.2f, true);
+  EXPECT_EQ(0, diff);
+}
+
 float TestScaleSumSamples(int benchmark_width,
                           int benchmark_height,
                           int benchmark_iterations,
@@ -2632,7 +2700,7 @@ float TestScaleSamples(int benchmark_width,
     }
   }
 
-  float max_diff =0.f;
+  float max_diff = 0.f;
   for (i = 0; i < y_plane_size / 4; ++i) {
     float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
                           (reinterpret_cast<float*>(dst_opt)[i]));