From 83ca1abe09207daae1628fd8f0d4a0debaef96c6 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 9 Aug 2017 14:25:38 -0700 Subject: [PATCH] Change ScaleSumSamples to return Sum of Squares TBR=kjellander@chromium.org BUG=libyuv:717 TEST=LibYUVPlanarTest.TestScaleSumSamples_Opt Change-Id: I5208666f3968c5c4b0f1b0c951f24216d78ee3fe Reviewed-on: https://chromium-review.googlesource.com/607184 Reviewed-by: Cheng Wang --- BUILD.gn | 7 +++--- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_common.cc | 12 +++++----- source/row_neon64.cc | 29 +++++++++++++----------- unit_test/planar_test.cc | 48 ++++++++++++++++++---------------------- 6 files changed, 50 insertions(+), 50 deletions(-) diff --git a/BUILD.gn b/BUILD.gn index 7f5f26f6a..3eefc3616 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -158,10 +158,11 @@ static_library("libyuv_internal") { } # To enable AVX2 or other cpu optimization, pass flag here - # cflags = [ "-mavx2" ] - # cflags = [ "-mpopcnt" ] + # cflags = [ "-mavx2", "-mpopcnt", "-mfma" ] + if (!is_win) { + cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON. 
+ } } - if (libyuv_use_neon) { static_library("libyuv_neon") { sources = [ diff --git a/README.chromium b/README.chromium index 13a188fe8..88c7c8660 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1664 +Version: 1665 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 97ced6a7a..4a1d3d1c0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1664 +#define LIBYUV_VERSION 1665 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_common.cc b/source/row_common.cc index b02aa2b5d..f490a8e3d 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2642,10 +2642,13 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { float fmax = 0.f; int i; +#if defined(__clang__) + #pragma clang loop vectorize_width(4) +#endif for (i = 0; i < width; ++i) { - float v = *src++ * scale; - *dst++ = v; - fmax = (v > fmax) ? 
v : fmax; + float v = *src++; + fmax += v * v; + *dst++ = v * scale; } return fmax; } @@ -2653,8 +2656,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { void ScaleSamples_C(const float* src, float* dst, float scale, int width) { int i; for (i = 0; i < width; ++i) { - float v = *src++ * scale; - *dst++ = v; + *dst++ = *src++ * scale; } } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 456c6ea5a..622ff5fbc 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2616,30 +2616,33 @@ float ScaleSumSamples_NEON(const float* src, float* dst, float scale, int width) { - float fmax; + float fsum; asm volatile( - "movi v3.4s, #0 \n" // max - "movi v4.4s, #0 \n" // max + "movi v5.4s, #0 \n" // sum + "movi v6.4s, #0 \n" // sum "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "subs %w2, %w2, #8 \n" // 8 processed per loop - "fmul v1.4s, v1.4s, %4.s[0] \n" // scale - "fmul v2.4s, v2.4s, %4.s[0] \n" // scale - "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples - "fmax v3.4s, v3.4s, v1.4s \n" // max - "fmax v4.4s, v4.4s, v2.4s \n" + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" - "fmax v3.4s, v3.4s, v4.4s \n" // max - "fmaxv %s3, v3.4s \n" // signed max acculator + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "fmov %w3, s5 \n" // sum : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 - "=w"(fmax) // %3 + "=w"(fsum) // %3 : "w"(scale) // %4 - : "cc", "memory", "v1", "v2", "v3", "v4"); - return fmax; + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fsum; } void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index a0d7881ed..2adc6e79c 100644 --- 
a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2527,7 +2527,7 @@ float TestScaleSumSamples(int benchmark_width, float scale, bool opt) { int i, j; - float max_c, max_opt; + float sum_c, sum_opt = 0.f; const int y_plane_size = benchmark_width * benchmark_height * 4; align_buffer_page_end(orig_y, y_plane_size * 3); @@ -2542,32 +2542,29 @@ float TestScaleSumSamples(int benchmark_width, memset(dst_c, 0, y_plane_size); memset(dst_opt, 1, y_plane_size); - // Disable all optimizations. - max_c = ScaleSumSamples_C(reinterpret_cast(orig_y), + sum_c = ScaleSumSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_c), scale, benchmark_width * benchmark_height); - // Enable optimizations. for (j = 0; j < benchmark_iterations; j++) { -#ifdef HAS_SCALESUMSAMPLES_NEON if (opt) { - max_opt = ScaleSumSamples_NEON(reinterpret_cast(orig_y), +#ifdef HAS_SCALESUMSAMPLES_NEON + sum_opt = ScaleSumSamples_NEON(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, benchmark_width * benchmark_height); - +#else + sum_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); +#endif } else { - max_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), + sum_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, benchmark_width * benchmark_height); } -#else - max_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), - reinterpret_cast(dst_opt), scale, - benchmark_width * benchmark_height); -#endif } - float max_diff = 0; + float max_diff = FAbs(sum_opt - sum_c); for (i = 0; i < y_plane_size / 4; ++i) { float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - (reinterpret_cast(dst_opt)[i])); @@ -2613,32 +2610,29 @@ float TestScaleSamples(int benchmark_width, memset(dst_c, 0, y_plane_size); memset(dst_opt, 1, y_plane_size); - // Disable all optimizations. ScaleSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_c), scale, benchmark_width * benchmark_height); - // Enable optimizations. 
for (j = 0; j < benchmark_iterations; j++) { -#ifdef HAS_SCALESAMPLES_NEON if (opt) { - max_opt = ScaleSamples_NEON(reinterpret_cast(orig_y), - reinterpret_cast(dst_opt), scale, - benchmark_width * benchmark_height); - +#ifdef HAS_SCALESUMSAMPLES_NEON + ScaleSamples_NEON(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); +#else + ScaleSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); +#endif } else { ScaleSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, benchmark_width * benchmark_height); } -#else - ScaleSamples_C(reinterpret_cast(orig_y), - reinterpret_cast(dst_opt), scale, - benchmark_width * benchmark_height); -#endif } - float max_diff = 0; + float max_diff =0.f; for (i = 0; i < y_plane_size / 4; ++i) { float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - (reinterpret_cast(dst_opt)[i]));