review of the parse_number_string function: now it's much faster, safer and easy to understand.

2025-12-06 16:56:57 +08:00 · 2025-05-07 18:02:44 +03:00 · 2025-05-07 18:02:44 +03:00 · 23a9c3f54d
commit 23a9c3f54d
parent 4b94a612cf
1 changed files with 104 additions and 58 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@ -238,7 +238,7 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend,
  }
 }

-enum class parse_error {
+enum class parse_error : uint_fast8_t {
  no_error,
  // A sign must be followed by an integer or dot.
  missing_integer_or_dot_after_sign,
@ -301,8 +301,8 @@ parse_number_string(UC const *p, UC const *pend,
  FASTFLOAT_ASSUME(p < pend); // so dereference without checks;
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
  answer.negative = (*p == UC('-'));
+  if (answer.negative ||
      // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
-  if ((*p == UC('-')) ||
      ((chars_format_t(options.format & chars_format::allow_leading_plus)) &&
       (!basic_json_fmt && *p == UC('+')))) {
    ++p;
@ -338,10 +338,13 @@ parse_number_string(UC const *p, UC const *pend,
            *p - UC('0'))); // might overflow, we will handle the overflow later
    ++p;
  }
+
  UC const *const end_of_integer_part = p;
  am_digits digit_count =
      static_cast<am_digits>(end_of_integer_part - start_digits);
  answer.integer = span<UC const>(start_digits, digit_count);
+  // We have now parsed the integer part of the mantissa.
+
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
  FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
    // at least 1 digit in integer part, without leading zeros
@ -355,8 +358,8 @@ parse_number_string(UC const *p, UC const *pend,
  }
 #endif

-  bool const has_decimal_point = (p != pend) && (*p == options.decimal_point);
-  if (has_decimal_point) {
+  // We can now parse the fraction part of the mantissa.
+  if ((p != pend) && (*p == options.decimal_point)) {
    ++p;
    UC const *before = p;
    // can occur at most twice without overflowing, but let it occur more, since
@ -378,7 +381,7 @@ parse_number_string(UC const *p, UC const *pend,
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
    FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
      // at least 1 digit in fractional part
-      if (has_decimal_point && answer.exponent == 0) {
+      if (answer.exponent == 0) {
        return report_parse_error<UC>(
            p, parse_error::no_digits_in_fractional_part);
      }
@ -392,44 +395,79 @@ parse_number_string(UC const *p, UC const *pend,

  // Now we can parse the explicit exponential part.
  am_pow_t exp_number = 0; // explicit exponential part
-  if (((p != pend) &&
-           (((chars_format_t(options.format & chars_format::scientific)) &&
-             ((UC('e') == *p) || (UC('E') == *p))))
-#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
-       || (((chars_format_t(options.format & detail::basic_fortran_fmt))) &&
-           ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) ||
-            (UC('D') == *p)))
-#endif
-           )) {
-    UC const *location_of_e = p;
-#ifdef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
-    ++p;
-#else
-    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) ||
-        (UC('D') == *p)) {
-      ++p;
-    }
-#endif
  bool neg_exp = false;
  if (p != pend) {
-      if (UC('-') == *p) {
+    UC const *location_of_e;
+    if (chars_format_t(options.format & chars_format::scientific)) {
+      switch (*p) {
+      case UC('e'):
+      case UC('E'):
+        location_of_e = p;
+        ++p;
+        break;
+      default:
+        // If it scientific and not fixed, we have to bail out.
+        if (!chars_format_t(options.format & chars_format::fixed)) {
+          return report_parse_error<UC>(p,
+                                        parse_error::missing_exponential_part);
+        }
+        // In fixed notation we will be ignoring the 'e'.
+        location_of_e = nullptr;
+      }
+      if (location_of_e && p != pend) {
+        switch (*p) {
+        case UC('-'):
          neg_exp = true;
          ++p;
-      } else if (UC('+') == *p) {
-        // '+' on exponent is allowed by C++17 20.19.3.(7.1)
+          break;
+        case UC('+'): // '+' on exponent is allowed by C++17 20.19.3.(7.1)
          ++p;
+          break;
+        default:
+          // If it scientific and not fixed, we have to bail out.
+          if (!chars_format_t(options.format & chars_format::fixed)) {
+            return report_parse_error<UC>(
+                p, parse_error::missing_exponential_part);
+          }
+          // In fixed notation we will be ignoring the 'e'.
+          location_of_e = nullptr;
        }
      }
-    if ((p == pend) || !is_integer(*p)) {
-      if (!(chars_format_t(options.format & chars_format::fixed))) {
-        // The exponential part is invalid for scientific notation, so it
-        // must be a trailing token for fixed notation. However, fixed
-        // notation is disabled, so report a scientific notation error.
+    }
+#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
+    else if (chars_format_t(options.format & detail::basic_fortran_fmt)) {
+      switch (*p) {
+      case UC('d'):
+      case UC('D'):
+        ++p;
+        break;
+      default:
+        // In Fortran the d symbol is optional.
+        break;
+      }
+      switch (*p) {
+      case UC('-'):
+        neg_exp = true;
+        location_of_e = p;
+        ++p;
+        break;
+      case UC('+'):
+        location_of_e = p;
+        ++p;
+        break;
+      default:
+        // In Fortran the sign is mandatory.
        return report_parse_error<UC>(p, parse_error::missing_exponential_part);
      }
-      // Otherwise, we will be ignoring the 'e'.
-      p = location_of_e;
-    } else {
+    }
+#endif
+    else {
+      location_of_e = nullptr;
+    }
+
+    if (location_of_e) {
+      // We have a valid scientific notation, let's parse the explicit
+      // exponent.
      while ((p != pend) && is_integer(*p)) {
        if (exp_number < 0x1000) {
          // check for exponent overflow if we have too many digits.
@ -443,35 +481,41 @@ parse_number_string(UC const *p, UC const *pend,
      }
      answer.exponent += exp_number;
    }
-  } else {
-    // If it scientific and not fixed, we have to bail out.
-    if ((chars_format_t(options.format & chars_format::scientific)) &&
-        !(chars_format_t(options.format & chars_format::fixed))) {
+#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
+    else if (chars_format_t(options.format & detail::basic_fortran_fmt)) {
+      // In Fortran the number in exponent part is mandatory.
      return report_parse_error<UC>(p, parse_error::missing_exponential_part);
    }
+#endif
  }
+
+  // We parsed all parts of the number, let's save progress.
  answer.lastmatch = p;
  answer.valid = true;

-  // If we frequently had to deal with long strings of digits,
+  // Now we can check for errors.
+
+  // TODO: If we frequently had to deal with long strings of digits,
  // we could extend our code by using a 128-bit integer instead
  // of a 64-bit integer. However, this is uncommon.
-  //
+
  // We can deal with up to 19 digits.
-  if (digit_count > 19) { // this is uncommon
+  if (digit_count > 19) {
    // It is possible that the integer had an overflow.
    // We have to handle the case where we have 0.0000somenumber.
    // We need to be mindful of the case where we only have zeroes...
    // E.g., 0.000000000...000.
    UC const *start = start_digits;
-    while ((start != pend) &&
-           (*start == UC('0') || *start == options.decimal_point)) {
+    do { // we already have some numbers, so we can skip first check safely
+      if ((*start == UC('0') || *start == options.decimal_point)) {
        if (*start == UC('0')) {
          --digit_count;
        }
-      ++start;
      }
+    } while (++start != pend);

+    // We have to check if we have a number with more than 19 significant
+    // digits.
    if (digit_count > 19) {
      answer.too_many_digits = true;
      // Let us start again, this time, avoiding overflows.
@ -480,19 +524,20 @@ parse_number_string(UC const *p, UC const *pend,
      answer.mantissa = 0;
      p = answer.integer.ptr;
      UC const *int_end = p + answer.integer.len();
-      am_mant_t const minimal_nineteen_digit_integer{1000000000000000000};
+      constexpr am_mant_t minimal_nineteen_digit_integer{1000000000000000000};
      while ((answer.mantissa < minimal_nineteen_digit_integer) &&
             (p != int_end)) {
        answer.mantissa = static_cast<am_mant_t>(
            answer.mantissa * 10 + static_cast<am_mant_t>(*p - UC('0')));
        ++p;
      }
-      if (answer.mantissa >=
-          minimal_nineteen_digit_integer) { // We have a big integers
+      if (answer.mantissa >= minimal_nineteen_digit_integer) {
+        // We have a big integers, so skip the fraction part completely.
        answer.exponent = am_pow_t(end_of_integer_part - p) + exp_number;
-      } else { // We have a value with a fractional component.
+      } else {
+        // We have a value with a significant fractional component.
        p = answer.fraction.ptr;
-        UC const *frac_end = p + answer.fraction.len();
+        UC const *const frac_end = p + answer.fraction.len();
        while ((answer.mantissa < minimal_nineteen_digit_integer) &&
               (p != frac_end)) {
          answer.mantissa = static_cast<am_mant_t>(
@ -501,9 +546,10 @@ parse_number_string(UC const *p, UC const *pend,
        }
        answer.exponent = am_pow_t(answer.fraction.ptr - p) + exp_number;
      }
+    }
    // We have now corrected both exponent and mantissa, to a truncated value
  }
-  }
+
  return answer;
 }