From 38613a39f9eb3763a68fe56f8acf22474ed9c5ed Mon Sep 17 00:00:00 2001 From: Maya Warrier Date: Wed, 17 May 2023 01:34:33 -0400 Subject: [PATCH] Fix perf decrease when UC = char --- include/fast_float/ascii_number.h | 80 +++++++++++++++---------------- include/fast_float/float_common.h | 2 +- 2 files changed, 39 insertions(+), 43 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 39f2a07..cc0af11 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -17,7 +17,7 @@ namespace fast_float { template -fastfloat_really_inline constexpr bool has_simd_opts() { +fastfloat_really_inline constexpr bool has_simd_opt() { #ifdef FASTFLOAT_HAS_SIMD return std::is_same::value; #else @@ -68,18 +68,7 @@ uint64_t read8_to_u64(const UC *chars) { fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) { -FASTFLOAT_SIMD_DISABLE_WARNINGS - static const char16_t kmasks[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - const __m128i masks = _mm_loadu_si128(reinterpret_cast(kmasks)); - - // todo: with AVX512BW + AVX512VL, can use cvtepi16_epi8 instead of masking + pack - __m128i masked = _mm_and_si128(data, masks); - __m128i packed = _mm_packus_epi16(masked, masked); - - uint64_t val; - _mm_storeu_si64(&val, packed); - return val; -FASTFLOAT_SIMD_RESTORE_WARNINGS + return _mm_cvtsi128_si64x(_mm_packus_epi16(data, data)); } fastfloat_really_inline @@ -92,7 +81,7 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS #endif // dummy for compile -template ())> +template ())> uint64_t simd_read8_to_u64(UC const*) { return 0; } @@ -132,7 +121,7 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) { template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t parse_eight_digits_unrolled(UC const * chars) noexcept { - if (cpp20_and_in_constexpr() || !has_simd_opts()) { + if (cpp20_and_in_constexpr() || !has_simd_opt()) { return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay } return parse_eight_digits_unrolled(simd_read8_to_u64(chars)); @@ -145,28 +134,18 @@ fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val 0x8080808080808080)); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool parse_if_eight_digits_unrolled(const char* chars, uint64_t& i) noexcept { - if (is_made_of_eight_digits_fast(read8_to_u64(chars))) { - i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(chars)); - return true; - } - else return false; -} + +#ifdef FASTFLOAT_HAS_SIMD // Call this if chars might not be 8 digits. // Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled()) -// ensures we don't load SIMD registers twice if we don't have to. -// -// Benchmark: -// https://quick-bench.com/q/Bbn0B4WmZsdgS3qDZWpggAY-jgs -// +// ensures we don't load SIMD registers twice. fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept { -#ifdef FASTFLOAT_SSE2 +bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept { if (cpp20_and_in_constexpr()) { return false; - } + } +#ifdef FASTFLOAT_SSE2 FASTFLOAT_SIMD_DISABLE_WARNINGS const __m128i data = _mm_loadu_si128(reinterpret_cast(chars)); @@ -181,18 +160,36 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS } else return false; FASTFLOAT_SIMD_RESTORE_WARNINGS - -#else // No SIMD available - - (void)chars; (void)i; // unused - return false; #endif } -// todo, no simd optimization yet +#endif + +// dummy for compile +template ())> +uint64_t simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) { + return 0; +} + + +template ::value)> fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool parse_if_eight_digits_unrolled(const char32_t*, uint64_t&) noexcept { - return false; +void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) { + if (!has_simd_opt()) { + return; + } + while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok + p += 8; + } +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) { + // optimizes better than parse_if_eight_digits_unrolled() for UC = char. + while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) { + i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok + p += 8; + } } template @@ -256,9 +253,8 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par UC const * before = p; // can occur at most twice without overflowing, but let it occur more, since // for integers with many digits, digit parsing is the primary bottleneck. - while ((std::distance(p, pend) >= 8) && parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok - p += 8; - } + loop_parse_if_eight_digits(p, pend, i); + while ((p != pend) && is_integer(*p)) { uint8_t digit = uint8_t(*p - UC('0')); ++p; diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h index 201e72f..80b022e 100644 --- a/include/fast_float/float_common.h +++ b/include/fast_float/float_common.h @@ -157,7 +157,7 @@ using parse_options = parse_options_t; // rust style `try!()` macro, or `?` operator #define FASTFLOAT_TRY(x) { if (!(x)) return false; } -#define FASTFLOAT_ENABLE_IF(test) typename std::enable_if<(test), int>::type = 0 +#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0 namespace fast_float {