replace checked re-parse with O(1) simdjson-style overflow check

The previous commit detects multi-wrap u64 overflow at the max_digits boundary by re-parsing the digits through a checked multiply-add loop (O(max_digits)). Replace that with the constant-time check used in simdjson: the leading digit plus a single threshold comparison.

For a max_digits-length value, let ms = min_safe_u64(base) == base^(max_digits-1). ms is the smallest such value and also the width of each leading-digit band [d*ms, (d+1)*ms). Since that width is < 2^64, the only band that can straddle 2^64 is d == dmax (the largest leading digit that still fits, i.e. UINT64_MAX / ms), and there it straddles at most once, so a single threshold dmax*ms separates wrapped from non-wrapped values. A leading digit above dmax always overflows; below dmax always fits. dmax and the threshold derive from the existing min_safe_u64 table, so no new tables are needed and dmax*ms cannot itself overflow.

Add a programmatic, self-verifying test for parse_int_string overflow detection covering bases 2..36, complementing the hand-picked strings added earlier. Every generated input is cross-checked against an independent trusted oracle (a plain 64-bit checked multiply-add); on success the parsed value is also compared exactly and full consumption of the input is asserted.

Per base it exercises:
- an exact-boundary sweep of the 64 values straddling 2^64 (UINT64_MAX-31 .. 2^64+31), built by walking the digit string;
- UINT64_MAX, 2^64 and the all-max-digit value, each also with leading zeros;
- random max_digits-length values across every leading digit, with the heaviest sampling on the lead == dmax band that straddles 2^64, and coverage of every lead > dmax (the multi-wrap region the naive min_safe check accepted by mistake);
- max_digits-1 digits (never overflows) and max_digits+1 digits (always overflows).

A small signed (int64_t) section checks that the exact INT64_MIN/INT64_MAX limits round-trip and that INT64_MAX+1 / INT64_MIN-1 are rejected in every base.
2026-07-30 08:16:25 +08:00 · 2026-06-13 21:22:10 -04:00 · 2026-06-13 21:22:10 -04:00 · a7249f86ed
commit a7249f86ed
parent 632cc97b5b
2 changed files with 291 additions and 18 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@ -782,27 +782,26 @@ parse_int_string(UC const *p, UC const *pend, T &value,
  // this check can be eliminated for all other types, but they will all require
  // a max_digits(base) equivalent
  if (digit_count == max_digits) {
-    // A value that wrapped below the smallest max_digits-length value has
-    // certainly overflowed.
-    if (i < min_safe_u64(base)) {
+    // At the max_digits boundary the accumulator `i` may have wrapped around
+    // 2^64. A plain `i < min_safe_u64(base)` test is not sufficient: for any
+    // base whose max_digits-length range exceeds 2^64 (base 10 reaches
+    // ~5.4 * 2^64 at 20 digits) the value can wrap a whole multiple of 2^64 and
+    // land back above min_safe, slipping through. Decide exactly in O(1) using
+    // the leading digit, following the approach used in simdjson:
+    //   ms   == min_safe_u64(base) == base^(max_digits-1), the smallest
+    //           max_digits-length value.
+    //   dmax == the largest leading digit whose number can still fit in u64.
+    // The leading-digit band [d*ms, (d+1)*ms) has width ms < 2^64, so within
+    // the single band where d == dmax the value straddles 2^64 at most once,
+    // and a single threshold separates wrapped from non-wrapped values. A
+    // leading digit above dmax always overflows; below dmax always fits.
+    uint64_t const ms = min_safe_u64(base);
+    uint64_t const dmax = (std::numeric_limits<uint64_t>::max)() / ms;
+    uint64_t const lead = ch_to_digit(*start_digits);
+    if (lead > dmax || (lead == dmax && i < dmax * ms)) {
      answer.ec = std::errc::result_out_of_range;
      return answer;
    }
-    // i >= min_safe_u64(base) is still not proof that it fits: for any base
-    // whose max_digits-length range exceeds 2^64 (base 10 reaches ~5.4 * 2^64
-    // at 20 digits) the accumulator can wrap a whole multiple of 2^64 and land
-    // back above min_safe, so the test above lets that overflow through. Re-run
-    // the parsed digits with a checked multiply-add to decide exactly.
-    uint64_t overflow_check = 0;
-    for (UC const *q = start_digits; q != p; ++q) {
-      uint8_t const digit = ch_to_digit(*q);
-      if (overflow_check >
-          (std::numeric_limits<uint64_t>::max() - digit) / uint64_t(base)) {
-        answer.ec = std::errc::result_out_of_range;
-        return answer;
-      }
-      overflow_check = uint64_t(base) * overflow_check + digit;
-    }
  }

  // check other types overflow
--- a/tests/fast_int.cpp
+++ b/tests/fast_int.cpp
@ -17,7 +17,10 @@
 #include <iostream>
 #include <vector>
 #include <string_view>
+#include <string>
 #include <cstring>
+#include <random>
+#include <algorithm>
 #include "fast_float/fast_float.h"
 #include <cstdint>

@ -1404,6 +1407,277 @@ int main() {
    }
  }

+  // Comprehensive, oracle-checked u64 overflow detection across every base.
+  //
+  // The accumulator in parse_int_string is allowed to overflow and the result
+  // is validated afterwards. At the max_digits boundary a value can wrap one or
+  // more whole multiples of 2^64 (a 20-digit base-10 number reaches ~5.4*2^64),
+  // so the boundary check must be exact. This section validates from_chars for
+  // bases 2..36 against an independent, trusted oracle: a plain 64-bit checked
+  // multiply-add. It hammers the single leading-digit band that straddles 2^64
+  // (where wrapped and non-wrapped values are hardest to tell apart) and also
+  // covers max_digits-1 (always in range) and max_digits+1 (always overflow).
+  {
+    auto digit_to_char = [](int d) -> char {
+      return d < 10 ? char('0' + d) : char('A' + (d - 10));
+    };
+    auto char_to_digit = [](char c) -> int {
+      if (c >= '0' && c <= '9') {
+        return c - '0';
+      }
+      if (c >= 'A' && c <= 'Z') {
+        return c - 'A' + 10;
+      }
+      return c - 'a' + 10;
+    };
+    // Trusted oracle: parse `s` in `base` with a checked 64-bit multiply-add.
+    // Returns true on u64 overflow; otherwise writes the value to `out`.
+    auto oracle = [&](std::string const &s, int base, uint64_t &out) -> bool {
+      uint64_t v = 0;
+      for (char c : s) {
+        uint64_t const d = uint64_t(char_to_digit(c));
+        if (v > (UINT64_MAX - d) / uint64_t(base)) {
+          return true;
+        }
+        v = uint64_t(base) * v + d;
+      }
+      out = v;
+      return false;
+    };
+    auto to_base = [&](uint64_t v, int base) -> std::string {
+      if (v == 0) {
+        return "0";
+      }
+      std::string s;
+      while (v != 0) {
+        s += digit_to_char(int(v % uint64_t(base)));
+        v /= uint64_t(base);
+      }
+      std::reverse(s.begin(), s.end());
+      return s;
+    };
+    // Add one (in base `base`) to the digit string `s`, carrying as needed.
+    auto increment = [&](std::string s, int base) -> std::string {
+      int carry = 1;
+      for (std::size_t k = s.size(); k-- > 0 && carry != 0;) {
+        int const d = char_to_digit(s[k]) + carry;
+        carry = d / base;
+        s[k] = digit_to_char(d % base);
+      }
+      if (carry != 0) {
+        s.insert(s.begin(), digit_to_char(carry));
+      }
+      return s;
+    };
+
+    // Subtract one (in base `base`) from a non-zero, non-negative string.
+    auto decrement = [&](std::string s, int base) -> std::string {
+      int borrow = 1;
+      for (std::size_t k = s.size(); k-- > 0 && borrow != 0;) {
+        int d = char_to_digit(s[k]) - borrow;
+        borrow = d < 0 ? 1 : 0;
+        if (d < 0) {
+          d += base;
+        }
+        s[k] = digit_to_char(d);
+      }
+      std::size_t lead = s.find_first_not_of('0'); // drop any leading zero
+      return lead == std::string::npos ? "0" : s.substr(lead);
+    };
+
+    std::mt19937_64 rng(0xC0FFEEULL);
+    long long checked = 0;
+    auto verify = [&](std::string const &s, int base) -> bool {
+      uint64_t expected = 0;
+      bool const ov = oracle(s, base, expected);
+      uint64_t result = 0xDEADBEEFULL;
+      auto answer =
+          fast_float::from_chars(s.data(), s.data() + s.size(), result, base);
+      ++checked;
+      if (ov) {
+        if (answer.ec != std::errc::result_out_of_range) {
+          std::cerr << "base " << base
+                    << ": expected result_out_of_range for \"" << s << "\""
+                    << std::endl;
+          return false;
+        }
+      } else {
+        if (answer.ec != std::errc()) {
+          std::cerr << "base " << base << ": unexpected error for \"" << s
+                    << "\"" << std::endl;
+          return false;
+        }
+        if (result != expected) {
+          std::cerr << "base " << base << ": \"" << s << "\" -> " << result
+                    << ", expected " << expected << std::endl;
+          return false;
+        }
+        if (answer.ptr != s.data() + s.size()) {
+          std::cerr << "base " << base << ": did not consume all of \"" << s
+                    << "\"" << std::endl;
+          return false;
+        }
+      }
+      return true;
+    };
+    // Leading zeros are stripped before the digit count, so the outcome must be
+    // unchanged. Checked only on hand-picked values (it exercises shared code).
+    auto verify_zeros = [&](std::string const &digits, int base) -> bool {
+      return verify(digits, base) && verify("0" + digits, base) &&
+             verify(std::string(40, '0') + digits, base);
+    };
+    auto random_tail = [&](std::string &s, int n, int base) {
+      for (int k = 0; k < n; ++k) {
+        // bias toward the extremes (0 and base-1) to hit boundaries often
+        std::uint64_t const r = rng();
+        int const mode = int(r % 4);
+        int const dig = mode == 0   ? 0
+                        : mode == 1 ? base - 1
+                                    : int((r >> 2) % std::uint64_t(base));
+        s += digit_to_char(dig);
+      }
+    };
+
+    for (int base = 2; base <= 36; ++base) {
+      // M = max number of base-`base` digits a u64 can hold.
+      std::string const maxstr = to_base(UINT64_MAX, base);
+      int const M = int(maxstr.size());
+      // b^(M-1): smallest M-digit value, and width of each leading-digit band.
+      uint64_t bM1 = 1;
+      for (int k = 0; k < M - 1; ++k) {
+        bM1 *= uint64_t(base);
+      }
+      int const dmax = int(UINT64_MAX / bM1); // largest leading digit that fits
+
+      // Exact-boundary sweep straddling 2^64 (the hardest transition): the
+      // 64 values UINT64_MAX-31 .. UINT64_MAX (in range) and 2^64 .. 2^64+31
+      // (overflow), built by walking the digit string up and down.
+      std::string below = maxstr, above = increment(maxstr, base);
+      for (int k = 0; k < 32; ++k) {
+        if (!verify(below, base) || !verify(above, base)) {
+          return EXIT_FAILURE;
+        }
+        below = decrement(below, base);
+        above = increment(above, base);
+      }
+      // Hand-picked values, also checked with leading zeros.
+      std::string const allmax(std::size_t(M), digit_to_char(base - 1));
+      if (!verify_zeros(maxstr, base) || // largest in-range value
+          !verify_zeros(increment(maxstr, base), base) || // smallest overflow
+          !verify_zeros(allmax, base)) { // largest M-digit (multi-wrap)
+        return EXIT_FAILURE;
+      }
+
+      // Randomized M-digit values across every leading digit. Bands with
+      // lead > dmax always overflow (this is where the naive min_safe check
+      // wrongly accepted multi-wrap values); lead < dmax always fits; lead ==
+      // dmax straddles 2^64 and gets the heaviest sampling.
+      for (int lead = 1; lead < base; ++lead) {
+        int const trials = lead == dmax ? 4000 : 300;
+        for (int trial = 0; trial < trials; ++trial) {
+          std::string s(1, digit_to_char(lead));
+          random_tail(s, M - 1, base);
+          if (!verify(s, base)) {
+            return EXIT_FAILURE;
+          }
+        }
+      }
+      // max_digits-1 digits never overflow; max_digits+1 digits always do.
+      for (int trial = 0; trial < 500; ++trial) {
+        std::string shorts(1,
+                           digit_to_char(1 + int(rng() % uint64_t(base - 1))));
+        random_tail(shorts, M - 2, base);
+        std::string longs(1,
+                          digit_to_char(1 + int(rng() % uint64_t(base - 1))));
+        random_tail(longs, M, base);
+        if (!verify(shorts, base) || !verify(longs, base)) {
+          return EXIT_FAILURE;
+        }
+      }
+    }
+    if (checked < 100000) {
+      std::cerr << "overflow sweep ran too few cases: " << checked << std::endl;
+      return EXIT_FAILURE;
+    }
+  }
+
+  // Signed (int64_t) boundary: every value that overflows u64 also overflows
+  // i64, and the exact i64 limits must parse. No oracle is needed here: the
+  // expected results are the known limits, hand-built as strings per base.
+  {
+    auto digit_to_char = [](int d) -> char {
+      return d < 10 ? char('0' + d) : char('A' + (d - 10));
+    };
+    auto to_base_signed = [&](int64_t value, int base) -> std::string {
+      // value may be INT64_MIN; accumulate magnitude in u64 to avoid UB.
+      bool const neg = value < 0;
+      uint64_t mag = neg ? (~uint64_t(value) + 1) : uint64_t(value);
+      std::string s;
+      if (mag == 0) {
+        s = "0";
+      }
+      while (mag != 0) {
+        s += digit_to_char(int(mag % uint64_t(base)));
+        mag /= uint64_t(base);
+      }
+      if (neg) {
+        s += '-';
+      }
+      std::reverse(s.begin(), s.end());
+      return s;
+    };
+    for (int base = 2; base <= 36; ++base) {
+      struct {
+        int64_t v;
+      } const limits[] = {{INT64_MAX}, {INT64_MIN}, {0}, {-1}, {1}};
+
+      for (auto const &lim : limits) {
+        std::string const s = to_base_signed(lim.v, base);
+        int64_t result = 123;
+        auto answer =
+            fast_float::from_chars(s.data(), s.data() + s.size(), result, base);
+        if (answer.ec != std::errc() || result != lim.v) {
+          std::cerr << "base " << base << ": signed limit \"" << s
+                    << "\" failed to round-trip (got " << result << ")"
+                    << std::endl;
+          return EXIT_FAILURE;
+        }
+      }
+      // Increment a non-negative magnitude string (in `base`) by one.
+      auto inc_mag = [&](std::string m) -> std::string {
+        int carry = 1;
+        for (std::size_t k = m.size(); k-- > 0 && carry != 0;) {
+          int d = (m[k] >= '0' && m[k] <= '9')   ? m[k] - '0'
+                  : (m[k] >= 'A' && m[k] <= 'Z') ? m[k] - 'A' + 10
+                                                 : m[k] - 'a' + 10;
+          d += carry;
+          carry = d / base;
+          m[k] = digit_to_char(d % base);
+        }
+        if (carry != 0) {
+          m.insert(m.begin(), digit_to_char(carry));
+        }
+        return m;
+      };
+      // INT64_MAX + 1 (= 2^63) overflows a positive int64_t.
+      // INT64_MIN - 1 (= -(2^63 + 1)) overflows a negative int64_t.
+      // Note that -(2^63) == INT64_MIN is in range and is covered above.
+      std::string const max_mag = to_base_signed(INT64_MAX, base); // 2^63 - 1
+      std::string const over = inc_mag(max_mag);                   // 2^63
+      std::string const under = "-" + inc_mag(over); // -(2^63 + 1)
+      for (std::string const &s : {over, under}) {
+        int64_t result = 123;
+        auto answer =
+            fast_float::from_chars(s.data(), s.data() + s.size(), result, base);
+        if (answer.ec != std::errc::result_out_of_range) {
+          std::cerr << "base " << base << ": expected result_out_of_range for "
+                    << "signed \"" << s << "\"" << std::endl;
+          return EXIT_FAILURE;
+        }
+      }
+    }
+  }
+
  return EXIT_SUCCESS;
 }
 #else