From 56bbcdf42207008d63f0ae4b9b3b014ed0741d08 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 14 Aug 2017 16:32:58 -0700 Subject: [PATCH] Reintroduce the max version of scale add ScaleMaxSamples_NEON function with max done on original values. TBR=kjellander@chromium.org BUG=libyuv:717 TEST=LibYUVPlanarTest.TestScaleMaxSamples_Opt Change-Id: Id99338860782b10ffd24f66242eb42014c2e229e Reviewed-on: https://chromium-review.googlesource.com/614685 Reviewed-by: Frank Barchard Reviewed-by: Cheng Wang --- README.chromium | 2 +- include/libyuv/row.h | 5 +++ include/libyuv/version.h | 2 +- source/row_common.cc | 18 +++++++++-- source/row_neon64.cc | 30 +++++++++++++++++ unit_test/planar_test.cc | 70 +++++++++++++++++++++++++++++++++++++++- 6 files changed, 121 insertions(+), 6 deletions(-) diff --git a/README.chromium b/README.chromium index 88c7c8660..757e86d0b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1665 +Version: 1666 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4fc483f96..164433e6b 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -3178,6 +3178,11 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, const uint8* luma, uint32 lumacoeff); +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width); float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); float ScaleSumSamples_NEON(const float* src, float* dst, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 4a1d3d1c0..b9f3d6522 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1665 +#define LIBYUV_VERSION 1666 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_common.cc 
b/source/row_common.cc index f490a8e3d..c9f71b851 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2640,16 +2640,28 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, #endif float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { - float fmax = 0.f; + float fsum = 0.f; int i; #if defined(__clang__) #pragma clang loop vectorize_width(4) -#endif +#endif for (i = 0; i < width; ++i) { float v = *src++; - fmax += v * v; + fsum += v * v; *dst++ = v * scale; } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } return fmax; } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 622ff5fbc..53248c64b 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2612,6 +2612,36 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { : "cc", "memory", "v1", "v2", "v3"); } +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max accumulator + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fmax; +} + float ScaleSumSamples_NEON(const float* src, float* dst, float scale, diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 2adc6e79c..dbae3658b
100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2521,6 +2521,74 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +float TestScaleMaxSamples(int benchmark_width, + int benchmark_height, + int benchmark_iterations, + float scale, + bool opt) { + int i, j; + float max_c, max_opt = 0.f; + const int y_plane_size = benchmark_width * benchmark_height * 4; + + align_buffer_page_end(orig_y, y_plane_size * 3); + uint8* dst_opt = orig_y + y_plane_size; + uint8* dst_c = orig_y + y_plane_size * 2; + + // Randomize works but may contain some denormals affecting performance. + // MemRandomize(orig_y, y_plane_size); + for (i = 0; i < y_plane_size / 4; ++i) { + (reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; + } + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 1, y_plane_size); + + max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_c), scale, + benchmark_width * benchmark_height); + + for (j = 0; j < benchmark_iterations; j++) { + if (opt) { +#ifdef HAS_SCALESUMSAMPLES_NEON + max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_opt), scale, + benchmark_width * benchmark_height); +#else + max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_opt), scale, + benchmark_width * benchmark_height); +#endif + } else { + max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y), + reinterpret_cast<float*>(dst_opt), scale, + benchmark_width * benchmark_height); + } + } + + float max_diff = FAbs(max_opt - max_c); + for (i = 0; i < y_plane_size / 4; ++i) { + float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - + (reinterpret_cast<float*>(dst_opt)[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(orig_y); + return max_diff; +} + +TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_C) { + float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, 1.2f, false); + EXPECT_EQ(0,
diff); +} + +TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_Opt) { + float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, 1.2f, true); + EXPECT_EQ(0, diff); +} + float TestScaleSumSamples(int benchmark_width, int benchmark_height, int benchmark_iterations, @@ -2632,7 +2700,7 @@ float TestScaleSamples(int benchmark_width, } } - float max_diff =0.f; + float max_diff = 0.f; for (i = 0; i < y_plane_size / 4; ++i) { float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) - (reinterpret_cast<float*>(dst_opt)[i]));