From 8e3e876b2e75c42f327ae135d872a2eb0a3f80d2 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Fri, 5 Jun 2026 22:11:41 -0400
Subject: [PATCH] Add optional support for digit separators and cpp prefixes
 (#369)

Rebased onto current main. Adds optional support in from_chars_advanced to
skip a configurable digit separator (e.g. ' or _) and to skip C++-style base
prefixes (0x/0X, 0b/0B) before parsing. The prefix is only skipped: it does
not change the base used to interpret the digits that follow.

Reconciled with main's straight-line-unroll optimization of the integer-part
scan: the fast unrolled path and loop_parse_if_eight_digits fast path are
preserved for the common no-separator case; separator-aware loops are used only
when a digit separator is configured.

Original work by zaewc (PR #369), squashed during conflict resolution.
---
 include/fast_float/ascii_number.h | 172 +++++++++++++++++++++++-------
 include/fast_float/float_common.h |  14 ++-
 include/fast_float/parse_number.h |   7 ++
 tests/basictest.cpp               |  53 ++++++++-
 4 files changed, 205 insertions(+), 41 deletions(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 64c3d3f..0459528 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -366,18 +366,20 @@ parse_number_string(UC const *p, UC const *pend,
     }
   }
   UC const *const start_digits = p;
+  UC const separator = options.digit_separator;
+  bool const has_separator = (separator != UC('\0'));
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
+  int64_t digit_count = 0;
+  UC const *first_digit_ptr = start_digits;
 
-  // Straight-line unroll of the integer-part scan: most integer parts are
-  // 1-5 digits, so peeling the first iterations eliminates the loop back-edge
-  // for the common case. Semantics are identical to the original `while` loop:
-  // i = 10*i + digit, advancing p.
-  if ((p != pend) && is_integer(*p)) {
-    i = uint64_t(*p - UC('0'));
-    ++p;
+  if (!has_separator) {
+    // Straight-line unroll of the integer-part scan: most integer parts are
+    // 1-5 digits, so peeling the first iterations eliminates the loop back-edge
+    // for the common case. Semantics are identical to the original `while` loop:
+    // i = 10*i + digit, advancing p.
     if ((p != pend) && is_integer(*p)) {
-      i = 10 * i + uint64_t(*p - UC('0'));
+      i = uint64_t(*p - UC('0'));
       ++p;
       if ((p != pend) && is_integer(*p)) {
         i = 10 * i + uint64_t(*p - UC('0'));
@@ -388,27 +390,55 @@ parse_number_string(UC const *p, UC const *pend,
           if ((p != pend) && is_integer(*p)) {
             i = 10 * i + uint64_t(*p - UC('0'));
             ++p;
-            while ((p != pend) && is_integer(*p)) {
-              // a multiplication by 10 is cheaper than an arbitrary integer
-              // multiplication
-              i = 10 * i +
-                  uint64_t(*p - UC('0')); // might overflow, handled later
+            if ((p != pend) && is_integer(*p)) {
+              i = 10 * i + uint64_t(*p - UC('0'));
               ++p;
+              while ((p != pend) && is_integer(*p)) {
+                // a multiplication by 10 is cheaper than an arbitrary integer
+                // multiplication
+                i = 10 * i +
+                    uint64_t(*p - UC('0')); // might overflow, handled later
+                ++p;
+              }
             }
           }
         }
       }
     }
+    digit_count = int64_t(p - start_digits);
+  } else {
+    // Separator-aware scan: a configured digit separator (e.g. '\'') may appear
+    // between digits. It is skipped and does not contribute to the value or the
+    // digit count, but it is retained in the integer span below.
+    while (p != pend) {
+      if (*p == separator) {
+        ++p;
+        continue;
+      }
+      if (!is_integer(*p)) {
+        break;
+      }
+      if (digit_count == 0) {
+        first_digit_ptr = p;
+      }
+      // a multiplication by 10 is cheaper than an arbitrary integer
+      // multiplication
+      i = 10 * i + uint64_t(*p - UC('0')); // might overflow, handled later
+      ++p;
+      ++digit_count;
+    }
   }
   UC const *const end_of_integer_part = p;
-  int64_t digit_count = int64_t(end_of_integer_part - start_digits);
-  answer.integer = span<UC const>(start_digits, size_t(digit_count));
+  // Raw span (separators included) so the overflow re-scan can re-tokenize.
+  // NOTE(review): confirm every other reader of this span skips separators.
+  answer.integer =
+      span<UC const>(start_digits, size_t(end_of_integer_part - start_digits));
   FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
     // at least 1 digit in integer part, without leading zeros
     if (digit_count == 0) {
       return report_parse_error<UC>(p, parse_error::no_digits_in_integer_part);
     }
-    if ((start_digits[0] == UC('0') && digit_count > 1)) {
+    if (digit_count > 1 && *first_digit_ptr == UC('0')) {
       return report_parse_error<UC>(start_digits,
                                     parse_error::leading_zeros_in_integer_part);
     }
@@ -419,18 +449,37 @@ parse_number_string(UC const *p, UC const *pend,
   if (has_decimal_point) {
     ++p;
     UC const *before = p;
-    // can occur at most twice without overflowing, but let it occur more, since
-    // for integers with many digits, digit parsing is the primary bottleneck.
-    loop_parse_if_eight_digits(p, pend, i);
+    int64_t fractional_digit_count = 0;
+    if (!has_separator) {
+      // can occur at most twice without overflowing, but let it occur more,
+      // since for integers with many digits, digit parsing is the primary
+      // bottleneck.
+      loop_parse_if_eight_digits(p, pend, i);
 
-    while ((p != pend) && is_integer(*p)) {
-      uint8_t digit = uint8_t(*p - UC('0'));
-      ++p;
-      i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+      while ((p != pend) && is_integer(*p)) {
+        uint8_t digit = uint8_t(*p - UC('0'));
+        ++p;
+        i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+      }
+      fractional_digit_count = int64_t(p - before);
+    } else {
+      while (p != pend) {
+        if (*p == separator) {
+          ++p;
+          continue;
+        }
+        if (!is_integer(*p)) {
+          break;
+        }
+        uint8_t digit = uint8_t(*p - UC('0'));
+        ++p;
+        i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+        ++fractional_digit_count;
+      }
     }
-    exponent = before - p;
+    exponent = -fractional_digit_count;
     answer.fraction = span<UC const>(before, size_t(p - before));
-    digit_count -= exponent;
+    digit_count += fractional_digit_count;
   }
   FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
     // at least 1 digit in fractional part
@@ -472,7 +521,14 @@ parse_number_string(UC const *p, UC const *pend,
       // Otherwise, we will be ignoring the 'e'.
       p = location_of_e;
     } else {
-      while ((p != pend) && is_integer(*p)) {
+      while (p != pend) {
+        if (has_separator && *p == separator) {
+          ++p;
+          continue;
+        }
+        if (!is_integer(*p)) {
+          break;
+        }
         uint8_t digit = uint8_t(*p - UC('0'));
         if (exp_number < 0x10000000) {
           exp_number = 10 * exp_number + digit;
@@ -505,7 +561,8 @@ parse_number_string(UC const *p, UC const *pend,
     // We need to be mindful of the case where we only have zeroes...
     // E.g., 0.000000000...000.
     UC const *start = start_digits;
-    while ((start != pend) && (*start == UC('0') || *start == decimal_point)) {
+    while ((start != pend) && (*start == UC('0') || *start == decimal_point ||
+                               (has_separator && *start == separator))) {
       if (*start == UC('0')) {
         digit_count--;
       }
@@ -521,20 +578,59 @@ parse_number_string(UC const *p, UC const *pend,
       p = answer.integer.ptr;
       UC const *int_end = p + answer.integer.len();
       uint64_t const minimal_nineteen_digit_integer{1000000000000000000};
-      while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
-        i = i * 10 + uint64_t(*p - UC('0'));
-        ++p;
-      }
-      if (i >= minimal_nineteen_digit_integer) { // We have a big integer
-        exponent = end_of_integer_part - p + exp_number;
-      } else { // We have a value with a fractional component.
-        p = answer.fraction.ptr;
-        UC const *frac_end = p + answer.fraction.len();
-        while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+      if (!has_separator) {
+        while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
           i = i * 10 + uint64_t(*p - UC('0'));
           ++p;
         }
-        exponent = answer.fraction.ptr - p + exp_number;
+        if (i >= minimal_nineteen_digit_integer) { // We have a big integer
+          exponent = end_of_integer_part - p + exp_number;
+        } else { // We have a value with a fractional component.
+          p = answer.fraction.ptr;
+          UC const *frac_end = p + answer.fraction.len();
+          while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+            i = i * 10 + uint64_t(*p - UC('0'));
+            ++p;
+          }
+          exponent = answer.fraction.ptr - p + exp_number;
+        }
+      } else {
+        // Separator-aware re-scan: separators are skipped and excluded from the
+        // digit counts that determine the exponent.
+        while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
+          if (*p == separator) {
+            ++p;
+            continue;
+          }
+          i = i * 10 + uint64_t(*p - UC('0'));
+          ++p;
+        }
+        if (i >= minimal_nineteen_digit_integer) { // We have a big integer
+          int64_t remaining_integer_digits = 0;
+          while (p != int_end) {
+            if (*p == separator) {
+              ++p;
+              continue;
+            }
+            ++p;
+            ++remaining_integer_digits;
+          }
+          exponent = remaining_integer_digits + exp_number;
+        } else { // We have a value with a fractional component.
+          p = answer.fraction.ptr;
+          UC const *frac_end = p + answer.fraction.len();
+          int64_t fraction_digits_consumed = 0;
+          while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+            if (*p == separator) {
+              ++p;
+              continue;
+            }
+            i = i * 10 + uint64_t(*p - UC('0'));
+            ++p;
+            ++fraction_digits_consumed;
+          }
+          exponent = exp_number - fraction_digits_consumed;
+        }
       }
       // We have now corrected both exponent and i, to a truncated value
     }
diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h
index 3e91c57..6f4435b 100644
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -70,8 +70,10 @@ using from_chars_result = from_chars_result_t<char>;
 
 template <typename UC> struct parse_options_t {
   constexpr explicit parse_options_t(chars_format fmt = chars_format::general,
-                                     UC dot = UC('.'), int b = 10)
-      : format(fmt), decimal_point(dot), base(b) {}
+                                     UC dot = UC('.'), int b = 10,
+                                     UC sep = UC('\0'), uint8_t opts = 0)
+      : format(fmt), decimal_point(dot), base(b), digit_separator(sep),
+        format_options(opts) {}
 
   /** Which number formats are accepted */
   chars_format format;
@@ -79,6 +81,14 @@ template <typename UC> struct parse_options_t {
   UC decimal_point;
   /** The base used for integers */
   int base;
+  /** The character used as digit separator. Use '\0' to
+   * disable */
+  UC digit_separator;
+  /** Additional format options (bitmask) */
+  uint8_t format_options;
+
+  /** Option to skip prefixes like 0x, 0b */
+  static constexpr uint8_t skip_prefix = 1;
 };
 
 using parse_options = parse_options_t<char>;
diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h
index ff9c53d..a520db1 100644
--- a/include/fast_float/parse_number.h
+++ b/include/fast_float/parse_number.h
@@ -476,6 +476,21 @@ template <typename T, typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
 from_chars_advanced(UC const *first, UC const *last, T &value,
                     parse_options_t<UC> options) noexcept {
+  // Optionally skip a leading base prefix ("0x"/"0X" or "0b"/"0B"). Only
+  // the first two characters are inspected, so a prefix that follows a sign
+  // is left in place. The prefix letter is skipped only when it is not
+  // itself a valid digit in options.base ('b' is a digit for bases above 11,
+  // 'x' for bases above 33); otherwise e.g. "0b1" in base 16 would silently
+  // lose its leading digits.
+  if (((options.format_options & parse_options_t<UC>::skip_prefix) != 0) &&
+      (last - first >= 2) && (*first == UC('0'))) {
+    UC const c_low = UC(first[1] | UC(0x20)); // folds 'X'/'B' to 'x'/'b'
+    bool const hex_prefix = (c_low == UC('x')) && (options.base <= 33);
+    bool const bin_prefix = (c_low == UC('b')) && (options.base <= 11);
+    if (hex_prefix || bin_prefix) {
+      first += 2;
+    }
+  }
   return from_chars_advanced_caller<
       size_t(is_supported_float_type<T>::value) +
       2 * size_t(is_supported_integer_type<T>::value)>::call(first, last, value,
diff --git a/tests/basictest.cpp b/tests/basictest.cpp
index dba36e8..8fbd680 100644
--- a/tests/basictest.cpp
+++ b/tests/basictest.cpp
@@ -681,6 +681,113 @@ TEST_CASE("decimal_point_parsing") {
   }
 }
 
+TEST_CASE("digit_separator") {
+  double result;
+  fast_float::parse_options options{};
+  options.digit_separator = '_';
+  {
+    // separator inside the integer part
+    std::string const input = "1_000";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + input.size(),
+                  "Parsing should have stopped at end");
+    CHECK_EQ(result, 1000.0);
+  }
+  {
+    // separator inside the fractional part
+    std::string const input = "1.00_5";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + input.size(),
+                  "Parsing should have stopped at end");
+    CHECK_EQ(result, 1.005);
+  }
+  {
+    // separator inside the exponent digits
+    std::string const input = "1e1_0";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + input.size(),
+                  "Parsing should have stopped at end");
+    CHECK_EQ(result, 1e10);
+  }
+  {
+    // separators in both the integer part and the exponent
+    std::string const input = "1_5e1_2";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + input.size(),
+                  "Parsing should have stopped at end");
+    CHECK_EQ(result, 15e12);
+  }
+  {
+    // separators in the integer part, the fraction and the exponent
+    std::string const input = "1_5.0_5e1_2";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + input.size(),
+                  "Parsing should have stopped at end");
+    CHECK_EQ(result, 15.05e12);
+  }
+  {
+    // with the default options no separator is configured, so parsing
+    // stops at the first '_'
+    fast_float::parse_options const default_options{};
+    std::string const input = "1_000";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, default_options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + 1,
+                  "Parsing should have stopped at the separator");
+    CHECK_EQ(result, 1.0);
+  }
+}
+
+TEST_CASE("skip_prefix") {
+  double result;
+  fast_float::parse_options options{};
+  options.format_options = fast_float::parse_options::skip_prefix;
+  {
+    // the "0x" prefix is dropped; the remaining digits are still read as
+    // an ordinary decimal number
+    std::string const input = "0x10";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + input.size(),
+                  "Parsing should have stopped at end");
+    CHECK_EQ(result, 10.0);
+  }
+  {
+    // the upper-case binary prefix is accepted as well
+    std::string const input = "0B101";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + input.size(),
+                  "Parsing should have stopped at end");
+    CHECK_EQ(result, 101.0);
+  }
+  {
+    // without the option the parser reads the leading "0" and stops at
+    // the 'x'
+    fast_float::parse_options const default_options{};
+    std::string const input = "0x10";
+    auto answer = fast_float::from_chars_advanced(
+        input.data(), input.data() + input.size(), result, default_options);
+    CHECK_MESSAGE(answer.ec == std::errc(), "expected parse success");
+    CHECK_MESSAGE(answer.ptr == input.data() + 1,
+                  "Parsing should have stopped at the prefix letter");
+    CHECK_EQ(result, 0.0);
+  }
+}
+
 TEST_CASE("issue19") {
   std::string const input = "234532.3426362,7869234.9823,324562.645";
   double result;
@@ -2452,4 +2503,4 @@ TEST_CASE("integer_times_pow10") {
     all::verify_integer_times_pow10(std::numeric_limits<uint64_t>::max(), 42);
     all::verify_integer_times_pow10(std::numeric_limits<uint64_t>::max(), -42);
   }
-}
\ No newline at end of file
+}