review of the parse_number_string function: now it's much faster, safer and easy to understand.

2026-02-06 17:59:52 +08:00 · 2025-05-07 18:02:44 +03:00 · 2025-05-07 18:02:44 +03:00 · 23a9c3f54d
commit 23a9c3f54d
parent 4b94a612cf
1 changed files with 104 additions and 58 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@ -238,7 +238,7 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend,
  }
 }
-enum class parse_error {
+enum class parse_error : uint_fast8_t {
  no_error,
  // A sign must be followed by an integer or dot.
  missing_integer_or_dot_after_sign,
@ -301,8 +301,8 @@ parse_number_string(UC const *p, UC const *pend,
  FASTFLOAT_ASSUME(p < pend); // so dereference without checks;
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
  answer.negative = (*p == UC('-'));
-  // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
+  if (answer.negative ||
-  if ((*p == UC('-')) ||
+      // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
      ((chars_format_t(options.format & chars_format::allow_leading_plus)) &&
       (!basic_json_fmt && *p == UC('+')))) {
    ++p;
@ -338,10 +338,13 @@ parse_number_string(UC const *p, UC const *pend,
            *p - UC('0'))); // might overflow, we will handle the overflow later
    ++p;
  }
  UC const *const end_of_integer_part = p;
  am_digits digit_count =
      static_cast<am_digits>(end_of_integer_part - start_digits);
  answer.integer = span<UC const>(start_digits, digit_count);
  // We have now parsed the integer part of the mantissa.
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
  FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
    // at least 1 digit in integer part, without leading zeros
@ -355,8 +358,8 @@ parse_number_string(UC const *p, UC const *pend,
  }
 #endif
-  bool const has_decimal_point = (p != pend) && (*p == options.decimal_point);
+  // We can now parse the fraction part of the mantissa.
-  if (has_decimal_point) {
+  if ((p != pend) && (*p == options.decimal_point)) {
    ++p;
    UC const *before = p;
    // can occur at most twice without overflowing, but let it occur more, since
@ -378,7 +381,7 @@ parse_number_string(UC const *p, UC const *pend,
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
    FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
      // at least 1 digit in fractional part
-      if (has_decimal_point && answer.exponent == 0) {
+      if (answer.exponent == 0) {
        return report_parse_error<UC>(
            p, parse_error::no_digits_in_fractional_part);
      }
@ -392,44 +395,79 @@ parse_number_string(UC const *p, UC const *pend,
  // Now we can parse the explicit exponential part.
  am_pow_t exp_number = 0; // explicit exponential part
-  if (((p != pend) &&
+  bool neg_exp = false;
-           (((chars_format_t(options.format & chars_format::scientific)) &&
+  if (p != pend) {
-             ((UC('e') == *p) || (UC('E') == *p))))
+    UC const *location_of_e;
-#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
+    if (chars_format_t(options.format & chars_format::scientific)) {
-       || (((chars_format_t(options.format & detail::basic_fortran_fmt))) &&
+      switch (*p) {
-           ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) ||
+      case UC('e'):
-            (UC('D') == *p)))
+      case UC('E'):
-#endif
+        location_of_e = p;
           )) {
    UC const *location_of_e = p;
 #ifdef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
    ++p;
 #else
    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) ||
        (UC('D') == *p)) {
      ++p;
    }
 #endif
    bool neg_exp = false;
    if (p != pend) {
      if (UC('-') == *p) {
        neg_exp = true;
        ++p;
      } else if (UC('+') == *p) {
        // '+' on exponent is allowed by C++17 20.19.3.(7.1)
        ++p;
        break;
      default:
        // If it scientific and not fixed, we have to bail out.
        if (!chars_format_t(options.format & chars_format::fixed)) {
          return report_parse_error<UC>(p,
                                        parse_error::missing_exponential_part);
        }
        // In fixed notation we will be ignoring the 'e'.
        location_of_e = nullptr;
      }
      if (location_of_e && p != pend) {
        switch (*p) {
        case UC('-'):
          neg_exp = true;
          ++p;
          break;
        case UC('+'): // '+' on exponent is allowed by C++17 20.19.3.(7.1)
          ++p;
          break;
        default:
          // If it scientific and not fixed, we have to bail out.
          if (!chars_format_t(options.format & chars_format::fixed)) {
            return report_parse_error<UC>(
                p, parse_error::missing_exponential_part);
          }
          // In fixed notation we will be ignoring the 'e'.
          location_of_e = nullptr;
        }
      }
    }
-    if ((p == pend) || !is_integer(*p)) {
+#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
-      if (!(chars_format_t(options.format & chars_format::fixed))) {
+    else if (chars_format_t(options.format & detail::basic_fortran_fmt)) {
-        // The exponential part is invalid for scientific notation, so it
+      switch (*p) {
-        // must be a trailing token for fixed notation. However, fixed
+      case UC('d'):
-        // notation is disabled, so report a scientific notation error.
+      case UC('D'):
        ++p;
        break;
      default:
        // In Fortran the d symbol is optional.
        break;
      }
      switch (*p) {
      case UC('-'):
        neg_exp = true;
        location_of_e = p;
        ++p;
        break;
      case UC('+'):
        location_of_e = p;
        ++p;
        break;
      default:
        // In Fortran the sign is mandatory.
        return report_parse_error<UC>(p, parse_error::missing_exponential_part);
      }
-      // Otherwise, we will be ignoring the 'e'.
+    }
-      p = location_of_e;
+#endif
-    } else {
+    else {
      location_of_e = nullptr;
    }
    if (location_of_e) {
      // We have a valid scientific notation, let's parse the explicit
      // exponent.
      while ((p != pend) && is_integer(*p)) {
        if (exp_number < 0x1000) {
          // check for exponent overflow if we have too many digits.
@ -443,35 +481,41 @@ parse_number_string(UC const *p, UC const *pend,
      }
      answer.exponent += exp_number;
    }
-  } else {
+#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
-    // If it scientific and not fixed, we have to bail out.
+    else if (chars_format_t(options.format & detail::basic_fortran_fmt)) {
-    if ((chars_format_t(options.format & chars_format::scientific)) &&
+      // In Fortran the number in exponent part is mandatory.
        !(chars_format_t(options.format & chars_format::fixed))) {
      return report_parse_error<UC>(p, parse_error::missing_exponential_part);
    }
 #endif
  }
  // We parsed all parts of the number, let's save progress.
  answer.lastmatch = p;
  answer.valid = true;
-  // If we frequently had to deal with long strings of digits,
+  // Now we can check for errors.
  // TODO: If we frequently had to deal with long strings of digits,
  // we could extend our code by using a 128-bit integer instead
  // of a 64-bit integer. However, this is uncommon.
-  //
+
  // We can deal with up to 19 digits.
-  if (digit_count > 19) { // this is uncommon
+  if (digit_count > 19) {
    // It is possible that the integer had an overflow.
    // We have to handle the case where we have 0.0000somenumber.
    // We need to be mindful of the case where we only have zeroes...
    // E.g., 0.000000000...000.
    UC const *start = start_digits;
-    while ((start != pend) &&
+    do { // we already have some numbers, so we can skip first check safely
-           (*start == UC('0') || *start == options.decimal_point)) {
+      if ((*start == UC('0') || *start == options.decimal_point)) {
-      if (*start == UC('0')) {
+        if (*start == UC('0')) {
-        --digit_count;
+          --digit_count;
        }
      }
-      ++start;
+    } while (++start != pend);
    }
    // We have to check if we have a number with more than 19 significant
    // digits.
    if (digit_count > 19) {
      answer.too_many_digits = true;
      // Let us start again, this time, avoiding overflows.
@ -480,19 +524,20 @@ parse_number_string(UC const *p, UC const *pend,
      answer.mantissa = 0;
      p = answer.integer.ptr;
      UC const *int_end = p + answer.integer.len();
-      am_mant_t const minimal_nineteen_digit_integer{1000000000000000000};
+      constexpr am_mant_t minimal_nineteen_digit_integer{1000000000000000000};
      while ((answer.mantissa < minimal_nineteen_digit_integer) &&
             (p != int_end)) {
        answer.mantissa = static_cast<am_mant_t>(
            answer.mantissa * 10 + static_cast<am_mant_t>(*p - UC('0')));
        ++p;
      }
-      if (answer.mantissa >=
+      if (answer.mantissa >= minimal_nineteen_digit_integer) {
-          minimal_nineteen_digit_integer) { // We have a big integers
+        // We have a big integers, so skip the fraction part completely.
        answer.exponent = am_pow_t(end_of_integer_part - p) + exp_number;
-      } else { // We have a value with a fractional component.
+      } else {
        // We have a value with a significant fractional component.
        p = answer.fraction.ptr;
-        UC const *frac_end = p + answer.fraction.len();
+        UC const *const frac_end = p + answer.fraction.len();
        while ((answer.mantissa < minimal_nineteen_digit_integer) &&
               (p != frac_end)) {
          answer.mantissa = static_cast<am_mant_t>(
@ -501,9 +546,10 @@ parse_number_string(UC const *p, UC const *pend,
        }
        answer.exponent = am_pow_t(answer.fraction.ptr - p) + exp_number;
      }
      // We have now corrected both exponent and mantissa, to a truncated value
    }
    // We have now corrected both exponent and mantissa, to a truncated value
  }
  return answer;
 }