From 933d43b5caefe47097debc86cd6a644242a5398c Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 4 Nov 2020 20:38:43 -0500 Subject: [PATCH] This is an experimental branch that might lead to some faster performance. It is currently unusable. --- include/fast_float/ascii_number.h | 88 +++++++++++++++++++------------ 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index b372367..55507d4 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -67,6 +67,7 @@ fastfloat_really_inline parsed_number_string parse_number_string(const char *p, const char *pend, chars_format fmt) noexcept { parsed_number_string answer; answer.valid = false; + answer.too_many_digits = false; answer.negative = (*p == '-'); if ((*p == '-') || (*p == '+')) { ++p; @@ -78,43 +79,80 @@ parsed_number_string parse_number_string(const char *p, const char *pend, chars_ } } const char *const start_digits = p; + // skip leading zeroes + while ((p != pend) && (*p == '0')) { p++; } + + // We can go forward up to 19 characters without overflow for sure, we might even go 20 characters + // or more if we have a decimal separator. We will adjust accordingly. + const char *pend_overflow_free = p + 19 > pend ? pend : p + 19; uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) - while ((p != pend) && is_integer(*p)) { + while ((p != pend_overflow_free) && is_integer(*p)) { // a multiplication by 10 is cheaper than an arbitrary integer // multiplication i = 10 * i + - (*p - '0'); // might overflow, we will handle the overflow later + (*p - '0'); ++p; } int64_t exponent = 0; - if ((p != pend) && (*p == '.')) { + if ((p != pend_overflow_free) && (*p == '.')) { ++p; const char *first_after_period = p; - if ((p + 8 <= pend) && is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok + if (i == 0) { + // Keep on skipping leading zeroes avec the decimal separator. + while ((p != pend) && (*p == '0')) { p++; } + // reset the ending point + pend_overflow_free = p + 19 > pend ? pend : p + 19; + } else if(pend_overflow_free < pend) { + pend_overflow_free++; // go one further thanks to '.' + } + if ((p + 8 <= pend_overflow_free) && is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); p += 8; - if ((p + 8 <= pend) && is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok + if ((p + 8 <= pend_overflow_free) && is_made_of_eight_digits_fast(p)) { + i = i * 100000000 + parse_eight_digits_unrolled(p); p += 8; } } - while ((p != pend) && is_integer(*p)) { + while ((p != pend_overflow_free) && is_integer(*p)) { uint8_t digit = uint8_t(*p - '0'); ++p; - i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + i = i * 10 + digit; } exponent = first_after_period - p; } // we must have encountered at least one integer! - if ((start_digits == p) || ((start_digits == p - 1) && (*start_digits == '.') )) { - return answer; + // We only need this check if i == 0 which is preditably unlikely. + if(i == 0) { + if ((start_digits == p) || ((start_digits == p - 1) && (*start_digits == '.') )) { + return answer; + } + } + if((p == pend_overflow_free) && (pend_overflow_free < pend)) { // We possibly have an overflow! + bool found_non_zero{false}; + if((exponent == 0) && (*(p-1) != '.')) { + // We have not yet encountered the '.' + // We do the pre-decimal part first. + while ((p != pend) && is_integer(*p)) { + found_non_zero |= (*p != '0'); + p++; + exponent += 1; + } + if ((p != pend) && (*p == '.')) { p++; } + while ((p != pend) && is_integer(*p)) { + found_non_zero |= (*p != '0'); + p++; + } + } else { + // This is the easy case, we just have to skip all of the digits! + while ((p != pend) && is_integer(*p)) { + found_non_zero |= (*p != '0'); + p++; + } + } + answer.too_many_digits = found_non_zero; } - - int32_t digit_count = - int32_t(p - start_digits - 1); // used later to guard against overflows - if ((p != pend) && (('e' == *p) || ('E' == *p))) { if((fmt & chars_format::fixed) && !(fmt & chars_format::scientific)) { return answer; } int64_t exp_number = 0; // exponential part @@ -142,26 +180,6 @@ parsed_number_string parse_number_string(const char *p, const char *pend, chars_ } answer.lastmatch = p; answer.valid = true; - - // If we frequently had to deal with long strings of digits, - // we could extend our code by using a 128-bit integer instead - // of a 64-bit integer. However, this is uncommon. - if (((digit_count >= 19))) { // this is uncommon - // It is possible that the integer had an overflow. - // We have to handle the case where we have 0.0000somenumber. - const char *start = start_digits; - while (*start == '0' || (*start == '.')) { - start++; - } - // we over-decrement by one when there is a decimal separator - digit_count -= int(start - start_digits); - if (digit_count >= 19) { - answer.mantissa = 0xFFFFFFFFFFFFFFFF; // important: we don't want the mantissa to be used in a fast path uninitialized. - answer.too_many_digits = true; - return answer; - } - } - answer.too_many_digits = false; answer.exponent = exponent; answer.mantissa = i; return answer;