diff --git a/.gitignore b/.gitignore index c9e8138..75dc2e8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,11 @@ Testing/* .cache/ compile_commands.json -# Visual Studio +# Visual studio .vs/ Debug/ Release/ +/out/ *.sln *.vcxproj *.vcxproj.filters diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 03e02b4..e339869 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -5,4 +5,5 @@ Neal Richardson Tim Paine Fabio Pellacini Lénárd Szolnoki -Jan Pharago \ No newline at end of file +Jan Pharago +Maya Warrier \ No newline at end of file diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index d506326..481b91d 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -5,11 +5,26 @@ #include #include #include +#include #include "float_common.h" +#ifdef FASTFLOAT_SSE2 +#include +#endif + + namespace fast_float { +template +fastfloat_really_inline constexpr bool has_simd_opt() { +#ifdef FASTFLOAT_HAS_SIMD + return std::is_same::value; +#else + return false; +#endif +} + // Next function can be micro-optimized, but compilers are entirely // able to optimize it well. template @@ -28,12 +43,14 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { | (val & 0x00000000000000FF) << 56; } +// Read 8 UC into a u64. Truncates UC if not char. +template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint64_t read_u64(const char *chars) { - if (cpp20_and_in_constexpr()) { +uint64_t read8_to_u64(const UC *chars) { + if (cpp20_and_in_constexpr() || !std::is_same::value) { uint64_t val = 0; for(int i = 0; i < 8; ++i) { - val |= uint64_t(*chars) << (i*8); + val |= uint64_t(uint8_t(*chars)) << (i*8); ++chars; } return val; @@ -47,6 +64,39 @@ uint64_t read_u64(const char *chars) { return val; } +#ifdef FASTFLOAT_SSE2 + +fastfloat_really_inline +uint64_t simd_read8_to_u64(const __m128i data) { +FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i packed = _mm_packus_epi16(data, data); +#ifdef FASTFLOAT_64BIT + return uint64_t(_mm_cvtsi128_si64(packed)); +#else + uint64_t value; + // Visual Studio + older versions of GCC don't support _mm_storeu_si64 + _mm_storel_epi64(reinterpret_cast<__m128i*>(&value), packed); + return value; +#endif +FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +fastfloat_really_inline +uint64_t simd_read8_to_u64(const char16_t* chars) { +FASTFLOAT_SIMD_DISABLE_WARNINGS + return simd_read8_to_u64(_mm_loadu_si128(reinterpret_cast(chars))); +FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +#endif + +// dummy for compile +template ())> +uint64_t simd_read8_to_u64(UC const*) { + return 0; +} + + fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void write_u64(uint8_t *chars, uint64_t val) { if (cpp20_and_in_constexpr()) { @@ -76,40 +126,80 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) { return uint32_t(val); } -fastfloat_really_inline constexpr -uint32_t parse_eight_digits_unrolled(const char16_t *) noexcept { - return 0; -} - -fastfloat_really_inline constexpr -uint32_t parse_eight_digits_unrolled(const char32_t *) noexcept { - return 0; -} +// Call this if chars are definitely 8 digits. +template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint32_t parse_eight_digits_unrolled(const char *chars) noexcept { - return parse_eight_digits_unrolled(read_u64(chars)); +uint32_t parse_eight_digits_unrolled(UC const * chars) noexcept { + if (cpp20_and_in_constexpr() || !has_simd_opt()) { + return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay + } + return parse_eight_digits_unrolled(simd_read8_to_u64(chars)); } + // credit @aqrit -fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept { +fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept { return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & 0x8080808080808080)); } -fastfloat_really_inline constexpr -bool is_made_of_eight_digits_fast(const char16_t *) noexcept { - return false; + +#ifdef FASTFLOAT_HAS_SIMD + +// Call this if chars might not be 8 digits. +// Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled()) +// ensures we don't load SIMD registers twice. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept { + if (cpp20_and_in_constexpr()) { + return false; + } +#ifdef FASTFLOAT_SSE2 +FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i data = _mm_loadu_si128(reinterpret_cast(chars)); + + // (x - '0') <= 9 + // http://0x80.pl/articles/simd-parsing-int-sequences.html + const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720)); + const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759)); + + if (_mm_movemask_epi8(t1) == 0) { + i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data)); + return true; + } + else return false; +FASTFLOAT_SIMD_RESTORE_WARNINGS +#endif } -fastfloat_really_inline constexpr -bool is_made_of_eight_digits_fast(const char32_t *) noexcept { - return false; +#endif + +// dummy for compile +template ())> +uint64_t simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) { + return 0; +} + + +template ::value)> +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) { + if (!has_simd_opt()) { + return; + } + while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok + p += 8; + } } fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_made_of_eight_digits_fast(const char *chars) noexcept { - return is_made_of_eight_digits_fast(read_u64(chars)); +void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) { + // optimizes better than parse_if_eight_digits_unrolled() for UC = char. + while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) { + i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok + p += 8; + } } template @@ -124,8 +214,10 @@ struct parsed_number_string_t { span integer{}; // non-nullable span fraction{}; // nullable }; -using byte_span = span; + +using byte_span = span; using parsed_number_string = parsed_number_string_t; + // Assuming that you use no more than 19 digits, this will // parse an ASCII string. template @@ -171,12 +263,8 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par UC const * before = p; // can occur at most twice without overflowing, but let it occur more, since // for integers with many digits, digit parsing is the primary bottleneck. - if (std::is_same::value) { - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok - p += 8; - } - } + loop_parse_if_eight_digits(p, pend, i); + while ((p != pend) && is_integer(*p)) { uint8_t digit = uint8_t(*p - UC('0')); ++p; @@ -241,6 +329,7 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par if(*start == UC('0')) { digit_count --; } start++; } + if (digit_count > 19) { answer.too_many_digits = true; // Let us start again, this time, avoiding overflows. @@ -248,22 +337,23 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par // pre-tokenized spans from above. i = 0; p = answer.integer.ptr; - UC const * int_end = p + answer.integer.len(); - const uint64_t minimal_nineteen_digit_integer{1000000000000000000}; - while((i < minimal_nineteen_digit_integer) && (p != int_end)) { + UC const* int_end = p + answer.integer.len(); + const uint64_t minimal_nineteen_digit_integer{ 1000000000000000000 }; + while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { i = i * 10 + uint64_t(*p - UC('0')); ++p; } if (i >= minimal_nineteen_digit_integer) { // We have a big integers exponent = end_of_integer_part - p + exp_number; - } else { // We have a value with a fractional component. - p = answer.fraction.ptr; - UC const * frac_end = p + answer.fraction.len(); - while((i < minimal_nineteen_digit_integer) && (p != frac_end)) { - i = i * 10 + uint64_t(*p - UC('0')); - ++p; - } - exponent = answer.fraction.ptr - p + exp_number; + } + else { // We have a value with a fractional component. + p = answer.fraction.ptr; + UC const* frac_end = p + answer.fraction.len(); + while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + exponent = answer.fraction.ptr - p + exp_number; } // We have now corrected both exponent and i, to a truncated value } diff --git a/include/fast_float/digit_comparison.h b/include/fast_float/digit_comparison.h index f469f6b..512a27f 100644 --- a/include/fast_float/digit_comparison.h +++ b/include/fast_float/digit_comparison.h @@ -201,18 +201,10 @@ bool is_truncated(span s) noexcept { return is_truncated(s.ptr, s.ptr + s.len()); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void parse_eight_digits(const char16_t*& , limb& , size_t& , size_t& ) noexcept { - // currently unused -} +template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void parse_eight_digits(const char32_t*& , limb& , size_t& , size_t& ) noexcept { - // currently unused -} - -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void parse_eight_digits(const char*& p, limb& value, size_t& counter, size_t& count) noexcept { +void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& count) noexcept { value = value * 100000000 + parse_eight_digits_unrolled(p); p += 8; counter += 8; @@ -264,10 +256,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ skip_zeros(p, pend); // process all digits, in increments of step per loop while (p != pend) { - if (std::is_same::value) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { - parse_eight_digits(p, value, counter, digits); - } + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { parse_one_digit(p, value, counter, digits); @@ -299,10 +289,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ } // process all digits, in increments of step per loop while (p != pend) { - if (std::is_same::value) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { - parse_eight_digits(p, value, counter, digits); - } + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { parse_one_digit(p, value, counter, digits); diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h index f82ab70..b1622b0 100644 --- a/include/fast_float/float_common.h +++ b/include/fast_float/float_common.h @@ -115,6 +115,34 @@ using parse_options = parse_options_t; #endif #endif +#if defined(__SSE2__) || \ + (defined(FASTFLOAT_VISUAL_STUDIO) && \ + (defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2))) +#define FASTFLOAT_SSE2 1 +#endif + +#ifdef FASTFLOAT_SSE2 +#define FASTFLOAT_HAS_SIMD 1 +#endif + +#if defined(__GNUC__) +// disable -Wcast-align=strict (GCC only) +#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-align\"") +#else +#define FASTFLOAT_SIMD_DISABLE_WARNINGS +#endif + +#if defined(__GNUC__) +#define FASTFLOAT_SIMD_RESTORE_WARNINGS \ + _Pragma("GCC diagnostic pop") +#else +#define FASTFLOAT_SIMD_RESTORE_WARNINGS +#endif + + + #ifdef FASTFLOAT_VISUAL_STUDIO #define fastfloat_really_inline __forceinline #else @@ -132,6 +160,9 @@ using parse_options = parse_options_t; // rust style `try!()` macro, or `?` operator #define FASTFLOAT_TRY(x) { if (!(x)) return false; } +#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0 + + namespace fast_float { fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h index 4541d70..e077b9d 100644 --- a/include/fast_float/parse_number.h +++ b/include/fast_float/parse_number.h @@ -166,6 +166,7 @@ from_chars_result_t from_chars_advanced(UC const * first, UC const * last, if (!pns.valid) { return detail::parse_infnan(first, last, value); } + answer.ec = std::errc(); // be optimistic answer.ptr = pns.lastmatch; // The implementation of the Clinger's fast path is convoluted because