diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index f895e9b..e6ead45 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -782,27 +782,26 @@ parse_int_string(UC const *p, UC const *pend, T &value, // this check can be eliminated for all other types, but they will all require // a max_digits(base) equivalent if (digit_count == max_digits) { - // A value that wrapped below the smallest max_digits-length value has - // certainly overflowed. - if (i < min_safe_u64(base)) { + // At the max_digits boundary the accumulator `i` may have wrapped around + // 2^64. A plain `i < min_safe_u64(base)` test is not sufficient: for any + // base whose max_digits-length range exceeds 2^64 (base 10 reaches + // ~5.4 * 2^64 at 20 digits) the value can wrap a whole multiple of 2^64 and + // land back above min_safe, slipping through. Decide exactly in O(1) using + // the leading digit, following the approach used in simdjson: + // ms == min_safe_u64(base) == base^(max_digits-1), the smallest + // max_digits-length value. + // dmax == the largest leading digit whose number can still fit in u64. + // The leading-digit band [d*ms, (d+1)*ms) has width ms < 2^64, so within + // the single band where d == dmax the value straddles 2^64 at most once, + // and a single threshold separates wrapped from non-wrapped values. A + // leading digit above dmax always overflows; below dmax always fits. + uint64_t const ms = min_safe_u64(base); + uint64_t const dmax = (std::numeric_limits::max)() / ms; + uint64_t const lead = ch_to_digit(*start_digits); + if (lead > dmax || (lead == dmax && i < dmax * ms)) { answer.ec = std::errc::result_out_of_range; return answer; } - // i >= min_safe_u64(base) is still not proof that it fits: for any base - // whose max_digits-length range exceeds 2^64 (base 10 reaches ~5.4 * 2^64 - // at 20 digits) the accumulator can wrap a whole multiple of 2^64 and land - // back above min_safe, so the test above lets that overflow through. Re-run - // the parsed digits with a checked multiply-add to decide exactly. - uint64_t overflow_check = 0; - for (UC const *q = start_digits; q != p; ++q) { - uint8_t const digit = ch_to_digit(*q); - if (overflow_check > - (std::numeric_limits::max() - digit) / uint64_t(base)) { - answer.ec = std::errc::result_out_of_range; - return answer; - } - overflow_check = uint64_t(base) * overflow_check + digit; - } } // check other types overflow diff --git a/tests/fast_int.cpp b/tests/fast_int.cpp index 9762f96..52ad5a6 100644 --- a/tests/fast_int.cpp +++ b/tests/fast_int.cpp @@ -17,7 +17,10 @@ #include #include #include +#include #include +#include +#include #include "fast_float/fast_float.h" #include @@ -1404,6 +1407,277 @@ int main() { } } + // Comprehensive, oracle-checked u64 overflow detection across every base. + // + // The accumulator in parse_int_string is allowed to overflow and the result + // is validated afterwards. At the max_digits boundary a value can wrap one or + // more whole multiples of 2^64 (a 20-digit base-10 number reaches ~5.4*2^64), + // so the boundary check must be exact. This section validates from_chars for + // bases 2..36 against an independent, trusted oracle: a plain 64-bit checked + // multiply-add. It hammers the single leading-digit band that straddles 2^64 + // (where wrapped and non-wrapped values are hardest to tell apart) and also + // covers max_digits-1 (always in range) and max_digits+1 (always overflow). + { + auto digit_to_char = [](int d) -> char { + return d < 10 ? char('0' + d) : char('A' + (d - 10)); + }; + auto char_to_digit = [](char c) -> int { + if (c >= '0' && c <= '9') { + return c - '0'; + } + if (c >= 'A' && c <= 'Z') { + return c - 'A' + 10; + } + return c - 'a' + 10; + }; + // Trusted oracle: parse `s` in `base` with a checked 64-bit multiply-add. + // Returns true on u64 overflow; otherwise writes the value to `out`. + auto oracle = [&](std::string const &s, int base, uint64_t &out) -> bool { + uint64_t v = 0; + for (char c : s) { + uint64_t const d = uint64_t(char_to_digit(c)); + if (v > (UINT64_MAX - d) / uint64_t(base)) { + return true; + } + v = uint64_t(base) * v + d; + } + out = v; + return false; + }; + auto to_base = [&](uint64_t v, int base) -> std::string { + if (v == 0) { + return "0"; + } + std::string s; + while (v != 0) { + s += digit_to_char(int(v % uint64_t(base))); + v /= uint64_t(base); + } + std::reverse(s.begin(), s.end()); + return s; + }; + // Add one (in base `base`) to the digit string `s`, carrying as needed. + auto increment = [&](std::string s, int base) -> std::string { + int carry = 1; + for (std::size_t k = s.size(); k-- > 0 && carry != 0;) { + int const d = char_to_digit(s[k]) + carry; + carry = d / base; + s[k] = digit_to_char(d % base); + } + if (carry != 0) { + s.insert(s.begin(), digit_to_char(carry)); + } + return s; + }; + + // Subtract one (in base `base`) from a non-zero, non-negative string. + auto decrement = [&](std::string s, int base) -> std::string { + int borrow = 1; + for (std::size_t k = s.size(); k-- > 0 && borrow != 0;) { + int d = char_to_digit(s[k]) - borrow; + borrow = d < 0 ? 1 : 0; + if (d < 0) { + d += base; + } + s[k] = digit_to_char(d); + } + std::size_t lead = s.find_first_not_of('0'); // drop any leading zero + return lead == std::string::npos ? "0" : s.substr(lead); + }; + + std::mt19937_64 rng(0xC0FFEEULL); + long long checked = 0; + auto verify = [&](std::string const &s, int base) -> bool { + uint64_t expected = 0; + bool const ov = oracle(s, base, expected); + uint64_t result = 0xDEADBEEFULL; + auto answer = + fast_float::from_chars(s.data(), s.data() + s.size(), result, base); + ++checked; + if (ov) { + if (answer.ec != std::errc::result_out_of_range) { + std::cerr << "base " << base + << ": expected result_out_of_range for \"" << s << "\"" + << std::endl; + return false; + } + } else { + if (answer.ec != std::errc()) { + std::cerr << "base " << base << ": unexpected error for \"" << s + << "\"" << std::endl; + return false; + } + if (result != expected) { + std::cerr << "base " << base << ": \"" << s << "\" -> " << result + << ", expected " << expected << std::endl; + return false; + } + if (answer.ptr != s.data() + s.size()) { + std::cerr << "base " << base << ": did not consume all of \"" << s + << "\"" << std::endl; + return false; + } + } + return true; + }; + // Leading zeros are stripped before the digit count, so the outcome must be + // unchanged. Checked only on hand-picked values (it exercises shared code). + auto verify_zeros = [&](std::string const &digits, int base) -> bool { + return verify(digits, base) && verify("0" + digits, base) && + verify(std::string(40, '0') + digits, base); + }; + auto random_tail = [&](std::string &s, int n, int base) { + for (int k = 0; k < n; ++k) { + // bias toward the extremes (0 and base-1) to hit boundaries often + std::uint64_t const r = rng(); + int const mode = int(r % 4); + int const dig = mode == 0 ? 0 + : mode == 1 ? base - 1 + : int((r >> 2) % std::uint64_t(base)); + s += digit_to_char(dig); + } + }; + + for (int base = 2; base <= 36; ++base) { + // M = max number of base-`base` digits a u64 can hold. + std::string const maxstr = to_base(UINT64_MAX, base); + int const M = int(maxstr.size()); + // b^(M-1): smallest M-digit value, and width of each leading-digit band. + uint64_t bM1 = 1; + for (int k = 0; k < M - 1; ++k) { + bM1 *= uint64_t(base); + } + int const dmax = int(UINT64_MAX / bM1); // largest leading digit that fits + + // Exact-boundary sweep straddling 2^64 (the hardest transition): the + // 64 values UINT64_MAX-31 .. UINT64_MAX (in range) and 2^64 .. 2^64+31 + // (overflow), built by walking the digit string up and down. + std::string below = maxstr, above = increment(maxstr, base); + for (int k = 0; k < 32; ++k) { + if (!verify(below, base) || !verify(above, base)) { + return EXIT_FAILURE; + } + below = decrement(below, base); + above = increment(above, base); + } + // Hand-picked values, also checked with leading zeros. + std::string const allmax(std::size_t(M), digit_to_char(base - 1)); + if (!verify_zeros(maxstr, base) || // largest in-range value + !verify_zeros(increment(maxstr, base), base) || // smallest overflow + !verify_zeros(allmax, base)) { // largest M-digit (multi-wrap) + return EXIT_FAILURE; + } + + // Randomized M-digit values across every leading digit. Bands with + // lead > dmax always overflow (this is where the naive min_safe check + // wrongly accepted multi-wrap values); lead < dmax always fits; lead == + // dmax straddles 2^64 and gets the heaviest sampling. + for (int lead = 1; lead < base; ++lead) { + int const trials = lead == dmax ? 4000 : 300; + for (int trial = 0; trial < trials; ++trial) { + std::string s(1, digit_to_char(lead)); + random_tail(s, M - 1, base); + if (!verify(s, base)) { + return EXIT_FAILURE; + } + } + } + // max_digits-1 digits never overflow; max_digits+1 digits always do. + for (int trial = 0; trial < 500; ++trial) { + std::string shorts(1, + digit_to_char(1 + int(rng() % uint64_t(base - 1)))); + random_tail(shorts, M - 2, base); + std::string longs(1, + digit_to_char(1 + int(rng() % uint64_t(base - 1)))); + random_tail(longs, M, base); + if (!verify(shorts, base) || !verify(longs, base)) { + return EXIT_FAILURE; + } + } + } + if (checked < 100000) { + std::cerr << "overflow sweep ran too few cases: " << checked << std::endl; + return EXIT_FAILURE; + } + } + + // Signed (int64_t) boundary: every value that overflows u64 also overflows + // i64, and the exact i64 limits must parse. Reuses the oracle indirectly via + // hand-built extremes per base. + { + auto digit_to_char = [](int d) -> char { + return d < 10 ? char('0' + d) : char('A' + (d - 10)); + }; + auto to_base_signed = [&](int64_t value, int base) -> std::string { + // value may be INT64_MIN; accumulate magnitude in u64 to avoid UB. + bool const neg = value < 0; + uint64_t mag = neg ? (~uint64_t(value) + 1) : uint64_t(value); + std::string s; + if (mag == 0) { + s = "0"; + } + while (mag != 0) { + s += digit_to_char(int(mag % uint64_t(base))); + mag /= uint64_t(base); + } + if (neg) { + s += '-'; + } + std::reverse(s.begin(), s.end()); + return s; + }; + for (int base = 2; base <= 36; ++base) { + struct { + int64_t v; + } const limits[] = {{INT64_MAX}, {INT64_MIN}, {0}, {-1}, {1}}; + + for (auto const &lim : limits) { + std::string const s = to_base_signed(lim.v, base); + int64_t result = 123; + auto answer = + fast_float::from_chars(s.data(), s.data() + s.size(), result, base); + if (answer.ec != std::errc() || result != lim.v) { + std::cerr << "base " << base << ": signed limit \"" << s + << "\" failed to round-trip (got " << result << ")" + << std::endl; + return EXIT_FAILURE; + } + } + // Increment a non-negative magnitude string (in `base`) by one. + auto inc_mag = [&](std::string m) -> std::string { + int carry = 1; + for (std::size_t k = m.size(); k-- > 0 && carry != 0;) { + int d = (m[k] >= '0' && m[k] <= '9') ? m[k] - '0' + : (m[k] >= 'A' && m[k] <= 'Z') ? m[k] - 'A' + 10 + : m[k] - 'a' + 10; + d += carry; + carry = d / base; + m[k] = digit_to_char(d % base); + } + if (carry != 0) { + m.insert(m.begin(), digit_to_char(carry)); + } + return m; + }; + // INT64_MAX + 1 (= 2^63) overflows a positive int64_t. + // INT64_MIN - 1 (= -(2^63 + 1)) overflows a negative int64_t. + // Note that -(2^63) == INT64_MIN is in range and is covered above. + std::string const max_mag = to_base_signed(INT64_MAX, base); // 2^63 - 1 + std::string const over = inc_mag(max_mag); // 2^63 + std::string const under = "-" + inc_mag(over); // -(2^63 + 1) + for (std::string const &s : {over, under}) { + int64_t result = 123; + auto answer = + fast_float::from_chars(s.data(), s.data() + s.size(), result, base); + if (answer.ec != std::errc::result_out_of_range) { + std::cerr << "base " << base << ": expected result_out_of_range for " + << "signed \"" << s << "\"" << std::endl; + return EXIT_FAILURE; + } + } + } + } + return EXIT_SUCCESS; } #else