diff --git a/BUILD.gn b/BUILD.gn index 01b023ee6..04bf80fff 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -122,6 +122,10 @@ static_library("libyuv") { # Enable optimize for speed (-O2) over size (-Os). configs += [ "//build/config/compiler:optimize_max" ] } + + # To enable AVX2 or other cpu optimization, pass flag here + # cflags = [ "-mavx2" ] + } if (libyuv_use_neon) { @@ -140,6 +144,14 @@ if (libyuv_use_neon) { public_configs = [ ":libyuv_config" ] + # Always enable optimization for Release and NaCl builds (to workaround + # crbug.com/538243). + if (!is_debug) { + configs -= [ "//build/config/compiler:default_optimization" ] + # Enable optimize for speed (-O2) over size (-Os). + configs += [ "//build/config/compiler:optimize_max" ] + } + if (current_cpu != "arm64") { configs -= [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ "-mfpu=neon" ] diff --git a/README.chromium b/README.chromium index 22c4937e3..159826180 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1627 +Version: 1628 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index ef3efa5f3..96861befb 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -201,7 +201,7 @@ extern "C" { #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 -// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast +// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I400TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 @@ -330,6 +330,11 @@ extern "C" { #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON +// TODO(fbarchard): Port to 32 bit. 
+#if defined(__aarch64__) +#define HAS_HALFFLOATROW_NEON +#endif + // Effects: #define HAS_ARGBADDROW_NEON #define HAS_ARGBATTENUATEROW_NEON @@ -1954,6 +1959,9 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale, void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width); void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale, int width); +void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width); +void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale, + int width); void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, uint32 lumacoeff); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 42835951f..cebc731db 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1627 +#define LIBYUV_VERSION 1628 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index da4c47b83..143ae869d 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2585,6 +2585,15 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y, } } #endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { HalfFloatRow(src_y, dst_y, scale, width); src_y += src_stride_y; diff --git a/source/row_any.cc b/source/row_any.cc index dbed89189..ec0aa21d7 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -585,6 +585,9 @@ ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15) #ifdef HAS_HALFFLOATROW_F16C ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15) #endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7) 
+#endif #undef ANY11P16 // Any 1 to 1 with yuvconstants diff --git a/source/row_neon64.cc b/source/row_neon64.cc index d62762dab..3d122680e 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2710,6 +2710,32 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } + +void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v1.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v1.4s, v1.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v1.4s, v1.4s, %3.s[0] \n" + "uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v4.8h, v1.4s, #13 \n" + MEMACCESS(1) + "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v4" + ); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 722074f73..c552c4a59 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2081,9 +2081,12 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { } } -TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) { +int TestHalfFloatPlane(int benchmark_width, int benchmark_height, + int benchmark_iterations, + int disable_cpu_flags, int benchmark_cpu_info, + float scale) { int i, j; - const int y_plane_size = benchmark_width_ * benchmark_height_ * 2; + const int y_plane_size = benchmark_width * benchmark_height * 2; align_buffer_page_end(orig_y, y_plane_size); align_buffer_page_end(dst_c, y_plane_size); @@ -2093,32 +2096,62 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) { memset(dst_opt, 1, y_plane_size); // Disable all optimizations. 
- MaskCpuFlags(disable_cpu_flags_); + MaskCpuFlags(disable_cpu_flags); double c_time = get_time(); - for (j = 0; j < benchmark_iterations_; j++) { - HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2, - (uint16*)dst_c, benchmark_width_ * 2, - 1.0f / 4096.0f, benchmark_width_, benchmark_height_); + for (j = 0; j < benchmark_iterations; j++) { + HalfFloatPlane((uint16*)orig_y, benchmark_width * 2, + (uint16*)dst_c, benchmark_width * 2, + scale, benchmark_width, benchmark_height); } - c_time = (get_time() - c_time) / benchmark_iterations_; + c_time = (get_time() - c_time) / benchmark_iterations; // Enable optimizations. - MaskCpuFlags(benchmark_cpu_info_); + MaskCpuFlags(benchmark_cpu_info); double opt_time = get_time(); - for (j = 0; j < benchmark_iterations_; j++) { - HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2, - (uint16*)dst_opt, benchmark_width_ * 2, - 1.0f / 4096.0f, benchmark_width_, benchmark_height_); + for (j = 0; j < benchmark_iterations; j++) { + HalfFloatPlane((uint16*)orig_y, benchmark_width * 2, + (uint16*)dst_opt, benchmark_width * 2, + scale, benchmark_width, benchmark_height); } - opt_time = (get_time() - opt_time) / benchmark_iterations_; + opt_time = (get_time() - opt_time) / benchmark_iterations; + int diff = 0; for (i = 0; i < y_plane_size; ++i) { - EXPECT_EQ(dst_c[i], dst_opt[i]); + diff = dst_c[i] - dst_opt[i]; + if (diff) break; } free_aligned_buffer_page_end(orig_y); free_aligned_buffer_page_end(dst_c); free_aligned_buffer_page_end(dst_opt); + return diff; +} + +// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes +// exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally +// happen since scale is 1/(1<