From 23a9c3f54dc44fae680af42fafbf5a4b811112ce Mon Sep 17 00:00:00 2001 From: IRainman Date: Wed, 7 May 2025 18:02:44 +0300 Subject: [PATCH] review of the parse_number_string function: now it's much faster, safer and easy to understand. --- include/fast_float/ascii_number.h | 162 +++++++++++++++++++----------- 1 file changed, 104 insertions(+), 58 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 144ca59..540877a 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -238,7 +238,7 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend, } } -enum class parse_error { +enum class parse_error : uint_fast8_t { no_error, // A sign must be followed by an integer or dot. missing_integer_or_dot_after_sign, @@ -301,8 +301,8 @@ parse_number_string(UC const *p, UC const *pend, FASTFLOAT_ASSUME(p < pend); // so dereference without checks; #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN answer.negative = (*p == UC('-')); - // C++17 20.19.3.(7.1) explicitly forbids '+' sign here - if ((*p == UC('-')) || + if (answer.negative || + // C++17 20.19.3.(7.1) explicitly forbids '+' sign here ((chars_format_t(options.format & chars_format::allow_leading_plus)) && (!basic_json_fmt && *p == UC('+')))) { ++p; @@ -338,10 +338,13 @@ parse_number_string(UC const *p, UC const *pend, *p - UC('0'))); // might overflow, we will handle the overflow later ++p; } + UC const *const end_of_integer_part = p; am_digits digit_count = static_cast(end_of_integer_part - start_digits); answer.integer = span(start_digits, digit_count); + // We have now parsed the integer part of the mantissa. + #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) { // at least 1 digit in integer part, without leading zeros @@ -355,8 +358,8 @@ parse_number_string(UC const *p, UC const *pend, } #endif - bool const has_decimal_point = (p != pend) && (*p == options.decimal_point); - if (has_decimal_point) { + // We can now parse the fraction part of the mantissa. + if ((p != pend) && (*p == options.decimal_point)) { ++p; UC const *before = p; // can occur at most twice without overflowing, but let it occur more, since @@ -378,7 +381,7 @@ parse_number_string(UC const *p, UC const *pend, #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) { // at least 1 digit in fractional part - if (has_decimal_point && answer.exponent == 0) { + if (answer.exponent == 0) { return report_parse_error( p, parse_error::no_digits_in_fractional_part); } @@ -392,44 +395,79 @@ parse_number_string(UC const *p, UC const *pend, // Now we can parse the explicit exponential part. am_pow_t exp_number = 0; // explicit exponential part - if (((p != pend) && - (((chars_format_t(options.format & chars_format::scientific)) && - ((UC('e') == *p) || (UC('E') == *p)))) -#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN - || (((chars_format_t(options.format & detail::basic_fortran_fmt))) && - ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) || - (UC('D') == *p))) -#endif - )) { - UC const *location_of_e = p; -#ifdef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN - ++p; -#else - if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) || - (UC('D') == *p)) { - ++p; - } -#endif - bool neg_exp = false; - if (p != pend) { - if (UC('-') == *p) { - neg_exp = true; - ++p; - } else if (UC('+') == *p) { - // '+' on exponent is allowed by C++17 20.19.3.(7.1) + bool neg_exp = false; + if (p != pend) { + UC const *location_of_e; + if (chars_format_t(options.format & chars_format::scientific)) { + switch (*p) { + case UC('e'): + case UC('E'): + location_of_e = p; ++p; + break; + default: + // If it scientific and not fixed, we have to bail out. + if (!chars_format_t(options.format & chars_format::fixed)) { + return report_parse_error(p, + parse_error::missing_exponential_part); + } + // In fixed notation we will be ignoring the 'e'. + location_of_e = nullptr; + } + if (location_of_e && p != pend) { + switch (*p) { + case UC('-'): + neg_exp = true; + ++p; + break; + case UC('+'): // '+' on exponent is allowed by C++17 20.19.3.(7.1) + ++p; + break; + default: + // If it scientific and not fixed, we have to bail out. + if (!chars_format_t(options.format & chars_format::fixed)) { + return report_parse_error( + p, parse_error::missing_exponential_part); + } + // In fixed notation we will be ignoring the 'e'. + location_of_e = nullptr; + } } } - if ((p == pend) || !is_integer(*p)) { - if (!(chars_format_t(options.format & chars_format::fixed))) { - // The exponential part is invalid for scientific notation, so it - // must be a trailing token for fixed notation. However, fixed - // notation is disabled, so report a scientific notation error. +#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN + else if (chars_format_t(options.format & detail::basic_fortran_fmt)) { + switch (*p) { + case UC('d'): + case UC('D'): + ++p; + break; + default: + // In Fortran the d symbol is optional. + break; + } + switch (*p) { + case UC('-'): + neg_exp = true; + location_of_e = p; + ++p; + break; + case UC('+'): + location_of_e = p; + ++p; + break; + default: + // In Fortran the sign is mandatory. return report_parse_error(p, parse_error::missing_exponential_part); } - // Otherwise, we will be ignoring the 'e'. - p = location_of_e; - } else { + } +#endif + else { + location_of_e = nullptr; + } + + if (location_of_e) { + // We have a valid scientific notation, let's parse the explicit + // exponent. while ((p != pend) && is_integer(*p)) { if (exp_number < 0x1000) { // check for exponent overflow if we have too many digits. @@ -443,35 +481,41 @@ parse_number_string(UC const *p, UC const *pend, } answer.exponent += exp_number; } - } else { - // If it scientific and not fixed, we have to bail out. - if ((chars_format_t(options.format & chars_format::scientific)) && - !(chars_format_t(options.format & chars_format::fixed))) { +#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN + else if (chars_format_t(options.format & detail::basic_fortran_fmt)) { + // In Fortran the number in exponent part is mandatory. return report_parse_error(p, parse_error::missing_exponential_part); } +#endif } + + // We parsed all parts of the number, let's save progress. answer.lastmatch = p; answer.valid = true; - // If we frequently had to deal with long strings of digits, + // Now we can check for errors. + + // TODO: If we frequently had to deal with long strings of digits, // we could extend our code by using a 128-bit integer instead // of a 64-bit integer. However, this is uncommon. - // + // We can deal with up to 19 digits. - if (digit_count > 19) { // this is uncommon + if (digit_count > 19) { // It is possible that the integer had an overflow. // We have to handle the case where we have 0.0000somenumber. // We need to be mindful of the case where we only have zeroes... // E.g., 0.000000000...000. UC const *start = start_digits; - while ((start != pend) && - (*start == UC('0') || *start == options.decimal_point)) { - if (*start == UC('0')) { - --digit_count; + do { // we already have some numbers, so we can skip first check safely + if ((*start == UC('0') || *start == options.decimal_point)) { + if (*start == UC('0')) { + --digit_count; + } } - ++start; - } + } while (++start != pend); + // We have to check if we have a number with more than 19 significant + // digits. if (digit_count > 19) { answer.too_many_digits = true; // Let us start again, this time, avoiding overflows. @@ -480,19 +524,20 @@ parse_number_string(UC const *p, UC const *pend, answer.mantissa = 0; p = answer.integer.ptr; UC const *int_end = p + answer.integer.len(); - am_mant_t const minimal_nineteen_digit_integer{1000000000000000000}; + constexpr am_mant_t minimal_nineteen_digit_integer{1000000000000000000}; while ((answer.mantissa < minimal_nineteen_digit_integer) && (p != int_end)) { answer.mantissa = static_cast( answer.mantissa * 10 + static_cast(*p - UC('0'))); ++p; } - if (answer.mantissa >= - minimal_nineteen_digit_integer) { // We have a big integers + if (answer.mantissa >= minimal_nineteen_digit_integer) { + // We have a big integers, so skip the fraction part completely. answer.exponent = am_pow_t(end_of_integer_part - p) + exp_number; - } else { // We have a value with a fractional component. + } else { + // We have a value with a significant fractional component. p = answer.fraction.ptr; - UC const *frac_end = p + answer.fraction.len(); + UC const *const frac_end = p + answer.fraction.len(); while ((answer.mantissa < minimal_nineteen_digit_integer) && (p != frac_end)) { answer.mantissa = static_cast( @@ -501,9 +546,10 @@ parse_number_string(UC const *p, UC const *pend, } answer.exponent = am_pow_t(answer.fraction.ptr - p) + exp_number; } - // We have now corrected both exponent and mantissa, to a truncated value } + // We have now corrected both exponent and mantissa, to a truncated value } + return answer; }