From 8e3e876b2e75c42f327ae135d872a2eb0a3f80d2 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 5 Jun 2026 22:11:41 -0400 Subject: [PATCH] Add optional support for digit separators and cpp prefixes (#369) Rebased onto current main. Adds optional support in from_chars_advanced to skip a configurable digit separator (e.g. ') and to skip standard cpp prefixes (0x/0X, 0b/0B) before decimal parsing. Reconciled with main's straight-line-unroll optimization of the integer-part scan: the fast unrolled path and loop_parse_if_eight_digits fast path are preserved for the common no-separator case; separator-aware loops are used only when a digit separator is configured. Original work by zaewc (PR #369), squashed during conflict resolution. --- include/fast_float/ascii_number.h | 172 +++++++++++++++++++++++------- include/fast_float/float_common.h | 14 ++- include/fast_float/parse_number.h | 7 ++ tests/basictest.cpp | 53 ++++++++- 4 files changed, 205 insertions(+), 41 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 64c3d3f..0459528 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -366,18 +366,20 @@ parse_number_string(UC const *p, UC const *pend, } } UC const *const start_digits = p; + UC const separator = options.digit_separator; + bool const has_separator = (separator != UC('\0')); uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) + int64_t digit_count = 0; + UC const *first_digit_ptr = start_digits; - // Straight-line unroll of the integer-part scan: most integer parts are - // 1-5 digits, so peeling the first iterations eliminates the loop back-edge - // for the common case. Semantics are identical to the original `while` loop: - // i = 10*i + digit, advancing p. - if ((p != pend) && is_integer(*p)) { - i = uint64_t(*p - UC('0')); - ++p; + if (!has_separator) { + // Straight-line unroll of the integer-part scan: most integer parts are + // 1-5 digits, so peeling the first iterations eliminates the loop back-edge + // for the common case. Semantics are identical to the original `while` loop: + // i = 10*i + digit, advancing p. if ((p != pend) && is_integer(*p)) { - i = 10 * i + uint64_t(*p - UC('0')); + i = uint64_t(*p - UC('0')); ++p; if ((p != pend) && is_integer(*p)) { i = 10 * i + uint64_t(*p - UC('0')); @@ -388,27 +390,55 @@ parse_number_string(UC const *p, UC const *pend, if ((p != pend) && is_integer(*p)) { i = 10 * i + uint64_t(*p - UC('0')); ++p; - while ((p != pend) && is_integer(*p)) { - // a multiplication by 10 is cheaper than an arbitrary integer - // multiplication - i = 10 * i + - uint64_t(*p - UC('0')); // might overflow, handled later + if ((p != pend) && is_integer(*p)) { + i = 10 * i + uint64_t(*p - UC('0')); ++p; + while ((p != pend) && is_integer(*p)) { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + + uint64_t(*p - UC('0')); // might overflow, handled later + ++p; + } } } } } } + digit_count = int64_t(p - start_digits); + } else { + // Separator-aware scan: a configured digit separator (e.g. '\'') may appear + // between digits. It is skipped and does not contribute to the value or the + // digit count, but it is retained in the integer span below. + while (p != pend) { + if (*p == separator) { + ++p; + continue; + } + if (!is_integer(*p)) { + break; + } + if (digit_count == 0) { + first_digit_ptr = p; + } + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + uint64_t(*p - UC('0')); // might overflow, handled later + ++p; + ++digit_count; + } } UC const *const end_of_integer_part = p; - int64_t digit_count = int64_t(end_of_integer_part - start_digits); - answer.integer = span(start_digits, size_t(digit_count)); + // The span keeps the raw characters (separators included) so the overflow + // re-scan below can re-tokenize correctly. + answer.integer = + span(start_digits, size_t(end_of_integer_part - start_digits)); FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) { // at least 1 digit in integer part, without leading zeros if (digit_count == 0) { return report_parse_error(p, parse_error::no_digits_in_integer_part); } - if ((start_digits[0] == UC('0') && digit_count > 1)) { + if (digit_count > 1 && *first_digit_ptr == UC('0')) { return report_parse_error(start_digits, parse_error::leading_zeros_in_integer_part); } @@ -419,18 +449,37 @@ parse_number_string(UC const *p, UC const *pend, if (has_decimal_point) { ++p; UC const *before = p; - // can occur at most twice without overflowing, but let it occur more, since - // for integers with many digits, digit parsing is the primary bottleneck. - loop_parse_if_eight_digits(p, pend, i); + int64_t fractional_digit_count = 0; + if (!has_separator) { + // can occur at most twice without overflowing, but let it occur more, + // since for integers with many digits, digit parsing is the primary + // bottleneck. + loop_parse_if_eight_digits(p, pend, i); - while ((p != pend) && is_integer(*p)) { - uint8_t digit = uint8_t(*p - UC('0')); - ++p; - i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - UC('0')); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + } + fractional_digit_count = int64_t(p - before); + } else { + while (p != pend) { + if (*p == separator) { + ++p; + continue; + } + if (!is_integer(*p)) { + break; + } + uint8_t digit = uint8_t(*p - UC('0')); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + ++fractional_digit_count; + } } - exponent = before - p; + exponent = -fractional_digit_count; answer.fraction = span(before, size_t(p - before)); - digit_count -= exponent; + digit_count += fractional_digit_count; } FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) { // at least 1 digit in fractional part @@ -472,7 +521,14 @@ parse_number_string(UC const *p, UC const *pend, // Otherwise, we will be ignoring the 'e'. p = location_of_e; } else { - while ((p != pend) && is_integer(*p)) { + while (p != pend) { + if (has_separator && *p == separator) { + ++p; + continue; + } + if (!is_integer(*p)) { + break; + } uint8_t digit = uint8_t(*p - UC('0')); if (exp_number < 0x10000000) { exp_number = 10 * exp_number + digit; @@ -505,7 +561,8 @@ parse_number_string(UC const *p, UC const *pend, // We need to be mindful of the case where we only have zeroes... // E.g., 0.000000000...000. UC const *start = start_digits; - while ((start != pend) && (*start == UC('0') || *start == decimal_point)) { + while ((start != pend) && (*start == UC('0') || *start == decimal_point || + (has_separator && *start == separator))) { if (*start == UC('0')) { digit_count--; } @@ -521,20 +578,59 @@ parse_number_string(UC const *p, UC const *pend, p = answer.integer.ptr; UC const *int_end = p + answer.integer.len(); uint64_t const minimal_nineteen_digit_integer{1000000000000000000}; - while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { - i = i * 10 + uint64_t(*p - UC('0')); - ++p; - } - if (i >= minimal_nineteen_digit_integer) { // We have a big integer - exponent = end_of_integer_part - p + exp_number; - } else { // We have a value with a fractional component. - p = answer.fraction.ptr; - UC const *frac_end = p + answer.fraction.len(); - while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + if (!has_separator) { + while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { i = i * 10 + uint64_t(*p - UC('0')); ++p; } - exponent = answer.fraction.ptr - p + exp_number; + if (i >= minimal_nineteen_digit_integer) { // We have a big integer + exponent = end_of_integer_part - p + exp_number; + } else { // We have a value with a fractional component. + p = answer.fraction.ptr; + UC const *frac_end = p + answer.fraction.len(); + while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + exponent = answer.fraction.ptr - p + exp_number; + } + } else { + // Separator-aware re-scan: separators are skipped and excluded from the + // digit counts that determine the exponent. + while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { + if (*p == separator) { + ++p; + continue; + } + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + if (i >= minimal_nineteen_digit_integer) { // We have a big integer + int64_t remaining_integer_digits = 0; + while (p != int_end) { + if (*p == separator) { + ++p; + continue; + } + ++p; + ++remaining_integer_digits; + } + exponent = remaining_integer_digits + exp_number; + } else { // We have a value with a fractional component. + p = answer.fraction.ptr; + UC const *frac_end = p + answer.fraction.len(); + int64_t fraction_digits_consumed = 0; + while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + if (*p == separator) { + ++p; + continue; + } + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + ++fraction_digits_consumed; + } + exponent = exp_number - fraction_digits_consumed; + } } // We have now corrected both exponent and i, to a truncated value } diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h index 3e91c57..6f4435b 100644 --- a/include/fast_float/float_common.h +++ b/include/fast_float/float_common.h @@ -70,8 +70,10 @@ using from_chars_result = from_chars_result_t; template struct parse_options_t { constexpr explicit parse_options_t(chars_format fmt = chars_format::general, - UC dot = UC('.'), int b = 10) - : format(fmt), decimal_point(dot), base(b) {} + UC dot = UC('.'), int b = 10, + UC sep = UC('\0'), uint8_t opts = 0) + : format(fmt), decimal_point(dot), base(b), digit_separator(sep), + format_options(opts) {} /** Which number formats are accepted */ chars_format format; @@ -79,6 +81,14 @@ template struct parse_options_t { UC decimal_point; /** The base used for integers */ int base; + /** The character used as digit separator. Use '\0' to + * disable */ + UC digit_separator; + /** Additional format options (bitmask) */ + uint8_t format_options; + + /** Option to skip prefixes like 0x, 0b */ + static constexpr uint8_t skip_prefix = 1; }; using parse_options = parse_options_t; diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h index ff9c53d..a520db1 100644 --- a/include/fast_float/parse_number.h +++ b/include/fast_float/parse_number.h @@ -476,6 +476,13 @@ template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t from_chars_advanced(UC const *first, UC const *last, T &value, parse_options_t options) noexcept { + if (((options.format_options & parse_options_t::skip_prefix) != 0) && + (last - first >= 2) && (*first == UC('0'))) { + const UC c_low = UC(first[1] | UC(0x20)); + if (c_low == UC('x') || c_low == UC('b')) { + first += 2; + } + } return from_chars_advanced_caller< size_t(is_supported_float_type::value) + 2 * size_t(is_supported_integer_type::value)>::call(first, last, value, diff --git a/tests/basictest.cpp b/tests/basictest.cpp index dba36e8..8fbd680 100644 --- a/tests/basictest.cpp +++ b/tests/basictest.cpp @@ -681,6 +681,57 @@ TEST_CASE("decimal_point_parsing") { } } +TEST_CASE("digit_separator") { + double result; + fast_float::parse_options options{}; + options.digit_separator = '_'; + { + std::string const input = "1_000"; + auto answer = fast_float::from_chars_advanced( + input.data(), input.data() + input.size(), result, options); + CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success"); + CHECK_MESSAGE(answer.ptr == input.data() + input.size(), + "Parsing should have stopped at end"); + CHECK_EQ(result, 1000.0); + } + { + std::string const input = "1.00_5"; + auto answer = fast_float::from_chars_advanced( + input.data(), input.data() + input.size(), result, options); + CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success"); + CHECK_MESSAGE(answer.ptr == input.data() + input.size(), + "Parsing should have stopped at end"); + CHECK_EQ(result, 1.005); + } + { + std::string const input = "1e1_0"; + auto answer = fast_float::from_chars_advanced( + input.data(), input.data() + input.size(), result, options); + CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success"); + CHECK_MESSAGE(answer.ptr == input.data() + input.size(), + "Parsing should have stopped at end"); + CHECK_EQ(result, 1e10); + } + { + std::string const input = "1_5e1_2"; + auto answer = fast_float::from_chars_advanced( + input.data(), input.data() + input.size(), result, options); + CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success"); + CHECK_MESSAGE(answer.ptr == input.data() + input.size(), + "Parsing should have stopped at end"); + CHECK_EQ(result, 15e12); + } + { + std::string const input = "1_5.0_5e1_2"; + auto answer = fast_float::from_chars_advanced( + input.data(), input.data() + input.size(), result, options); + CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success"); + CHECK_MESSAGE(answer.ptr == input.data() + input.size(), + "Parsing should have stopped at end"); + CHECK_EQ(result, 15.05e12); + } +} + TEST_CASE("issue19") { std::string const input = "234532.3426362,7869234.9823,324562.645"; double result; @@ -2452,4 +2503,4 @@ TEST_CASE("integer_times_pow10") { all::verify_integer_times_pow10(std::numeric_limits::max(), 42); all::verify_integer_times_pow10(std::numeric_limits::max(), -42); } -} \ No newline at end of file +}