Skip materializing the integer/fraction spans on the hot path

parsed_number_string_t carries two span<UC const> members (integer, fraction) that are only read on the rare slow paths (digit_comp, and the >19-significant- digit truncation recompute). Materializing them on every parse forces the ~56/64- byte struct to be written out and marshaled through the by-value return, which shows up as backend/store pressure on the hot path. This adds a runtime `store_spans` flag (default true, so all existing callers are unchanged) to parse_number_string; from_chars_float_advanced parses with it false, attempts the Clinger and Eisel-Lemire fast paths inline, and only re-parses with spans on the two rare slow branches. The re-parse is pushed into a single `fastfloat_noinline` (noinline+cold) helper so the force-inlined hot scanner is emitted once rather than duplicated into the caller (without this the extra inline copies regress some targets, e.g. ARM gcc, by bloating the hot frame and lengthening the loop-carried dependency chain). A runtime flag is used deliberately rather than a template parameter: a template would create a second instantiation of the whole scanner whose icache cost wipes out the gain. Measured (per-parser microbench, median of 5, pinned core), fast_float from_chars <double>/<float>, vs the current tip: - Intel Ice Lake (Xeon 8360Y): +17-19% (gcc), Intel TMA shows backend-bound 26.0% -> 2.2% and retiring 60.3% -> 77.3% on short floats (the eliminated span spill), with -36% pipeline slots. - Intel Cascade Lake (Xeon 6248): +18-22% (gcc), +13-23% (clang). - ARM Neoverse-V2 (Graviton4): +73-196% (gcc), +8-11% (clang) -- the struct spill dominated the gcc hot loop there. Correctness: the full float exhaustive suite (exhaustive32, exhaustive32_64, exhaustive32_midpoint, random64) passes, and a 2^32 sweep is byte-identical to the current tip. Public from_chars / from_chars_advanced / parsed_number_string_t are unchanged.
2026-07-30 08:16:25 +08:00 · 2026-06-03 09:30:42 +01:00 · 2026-06-03 09:30:42 +01:00 · cb5d9cd9a4
commit cb5d9cd9a4
parent 6258cbc5a1
3 changed files with 104 additions and 27 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@ -330,10 +330,18 @@ report_parse_error(UC const *p, parse_error error) {

 // Assuming that you use no more than 19 digits, this will
 // parse an ASCII string.
+//
+// store_spans is a *runtime* flag (not a template parameter, deliberately: a
+// template would create a second instantiation of this whole function and the
+// extra icache pressure wipes out the gain). When false, the integer/fraction
+// spans (read only by the rare digit_comp slow path) are not materialized, which
+// keeps the fat parsed_number_string_t off the hot path. The caller re-parses
+// with store_spans=true if the slow path is actually reached.
 template <bool basic_json_fmt, typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
 parse_number_string(UC const *p, UC const *pend,
-                    parse_options_t<UC> options) noexcept {
+                    parse_options_t<UC> options,
+                    bool store_spans = true) noexcept {
  chars_format const fmt = detail::adjust_for_feature_macros(options.format);
  UC const decimal_point = options.decimal_point;

@ -402,7 +410,9 @@ parse_number_string(UC const *p, UC const *pend,
  }
  UC const *const end_of_integer_part = p;
  int64_t digit_count = int64_t(end_of_integer_part - start_digits);
-  answer.integer = span<UC const>(start_digits, size_t(digit_count));
+  if (store_spans) {
+    answer.integer = span<UC const>(start_digits, size_t(digit_count));
+  }
  FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
    // at least 1 digit in integer part, without leading zeros
    if (digit_count == 0) {
@ -429,7 +439,9 @@ parse_number_string(UC const *p, UC const *pend,
      i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
    }
    exponent = before - p;
-    answer.fraction = span<UC const>(before, size_t(p - before));
+    if (store_spans) {
+      answer.fraction = span<UC const>(before, size_t(p - before));
+    }
    digit_count -= exponent;
  }
  FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
@ -514,29 +526,35 @@ parse_number_string(UC const *p, UC const *pend,

    if (digit_count > 19) {
      answer.too_many_digits = true;
-      // Let us start again, this time, avoiding overflows.
-      // We don't need to call if is_integer, since we use the
-      // pre-tokenized spans from above.
-      i = 0;
-      p = answer.integer.ptr;
-      UC const *int_end = p + answer.integer.len();
-      uint64_t const minimal_nineteen_digit_integer{1000000000000000000};
-      while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
-        i = i * 10 + uint64_t(*p - UC('0'));
-        ++p;
-      }
-      if (i >= minimal_nineteen_digit_integer) { // We have a big integer
-        exponent = end_of_integer_part - p + exp_number;
-      } else { // We have a value with a fractional component.
-        p = answer.fraction.ptr;
-        UC const *frac_end = p + answer.fraction.len();
-        while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+      // The truncation recompute below reads the integer/fraction spans. When
+      // store_spans is false we didn't materialize them, so just flag
+      // too_many_digits; the caller re-parses with store_spans=true to obtain
+      // the corrected mantissa/exponent before taking the slow path.
+      if (store_spans) {
+        // Let us start again, this time, avoiding overflows.
+        // We don't need to call if is_integer, since we use the
+        // pre-tokenized spans from above.
+        i = 0;
+        p = answer.integer.ptr;
+        UC const *int_end = p + answer.integer.len();
+        uint64_t const minimal_nineteen_digit_integer{1000000000000000000};
+        while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
          i = i * 10 + uint64_t(*p - UC('0'));
          ++p;
        }
-        exponent = answer.fraction.ptr - p + exp_number;
+        if (i >= minimal_nineteen_digit_integer) { // We have a big integer
+          exponent = end_of_integer_part - p + exp_number;
+        } else { // We have a value with a fractional component.
+          p = answer.fraction.ptr;
+          UC const *frac_end = p + answer.fraction.len();
+          while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
+            i = i * 10 + uint64_t(*p - UC('0'));
+            ++p;
+          }
+          exponent = answer.fraction.ptr - p + exp_number;
+        }
+        // We have now corrected both exponent and i, to a truncated value
      }
-      // We have now corrected both exponent and i, to a truncated value
    }
  }
  answer.exponent = exponent;
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@ -197,6 +197,15 @@ using parse_options = parse_options_t<char>;
 #define fastfloat_really_inline inline __attribute__((always_inline))
 #endif

+// Force a function OUT of line and onto the cold path. Used for the rare
+// slow-path re-parse so the force-inlined hot scanner is not duplicated into
+// the caller (which bloated the hot frame and hurt ILP on some targets).
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#define fastfloat_noinline __declspec(noinline)
+#else
+#define fastfloat_noinline __attribute__((noinline, cold))
+#endif
+
 #ifndef FASTFLOAT_ASSERT
 #define FASTFLOAT_ASSERT(x)                                                    \
  { ((void)(x)); }
--- a/include/fast_float/parse_number.h
+++ b/include/fast_float/parse_number.h
@ -289,6 +289,23 @@ from_chars_advanced(parsed_number_string_t<UC> &pns, T &value) noexcept {
  return answer;
 }

+// Cold, out-of-line slow path: re-parse materializing the integer/fraction
+// spans the hot no-span parse skipped, then run the full algorithm. Marked
+// noinline+cold so the force-inlined spans scanner is emitted ONCE off the hot
+// path rather than duplicated into from_chars_float_advanced (which bloated the
+// hot frame). from_chars_advanced already handles both the too_many_digits
+// disambiguation and the am.power2<0 digit_comp recompute, so both slow branches
+// collapse to one helper call.
+template <typename T, typename UC>
+fastfloat_noinline FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+parse_number_slow_path(UC const *first, UC const *last, T &value,
+                       parse_options_t<UC> options, bool bjf) noexcept {
+  parsed_number_string_t<UC> pns =
+      bjf ? parse_number_string<true, UC>(first, last, options, true)
+          : parse_number_string<false, UC>(first, last, options, true);
+  return from_chars_advanced(pns, value);
+}
+
 template <typename T, typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
 from_chars_float_advanced(UC const *first, UC const *last, T &value,
@ -312,10 +329,15 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value,
    answer.ptr = first;
    return answer;
  }
+  bool const bjf = uint64_t(fmt & detail::basic_json_fmt) != 0;
+
+  // Fast path: parse WITHOUT materializing the integer/fraction spans (read only
+  // by the rare slow paths). Skipping their stores keeps the fat
+  // parsed_number_string_t off the hot path. store_spans is a runtime argument,
+  // so this reuses the single parse_number_string instantiation.
  parsed_number_string_t<UC> pns =
-      uint64_t(fmt & detail::basic_json_fmt)
-          ? parse_number_string<true, UC>(first, last, options)
-          : parse_number_string<false, UC>(first, last, options);
+      bjf ? parse_number_string<true, UC>(first, last, options, false)
+          : parse_number_string<false, UC>(first, last, options, false);
  if (!pns.valid) {
    if (uint64_t(fmt & chars_format::no_infnan)) {
      answer.ec = std::errc::invalid_argument;
@ -326,8 +348,36 @@ from_chars_float_advanced(UC const *first, UC const *last, T &value,
    }
  }

-  // call overload that takes parsed_number_string_t directly.
-  return from_chars_advanced(pns, value);
+  // Slow path A (rare): > 19 significant digits. The no-span parse left the
+  // mantissa un-truncated and skipped the span-based recompute; the cold helper
+  // re-parses with spans and runs the full algorithm.
+  if (pns.too_many_digits) {
+    return parse_number_slow_path<T, UC>(first, last, value, options, bjf);
+  }
+
+  answer.ec = std::errc(); // be optimistic
+  answer.ptr = pns.lastmatch;
+
+  if (clinger_fast_path_impl(pns.mantissa, pns.exponent, pns.negative, value)) {
+    return answer;
+  }
+
+  adjusted_mantissa am =
+      compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
+  // Slow path B (rare): Eisel-Lemire could not resolve; digit_comp needs the
+  // integer/fraction spans. Route to the cold helper (clinger there is a
+  // dead-effect since it already failed here; the cold re-parse + digit_comp via
+  // from_chars_advanced reproduces this branch).
+  if (am.power2 < 0) {
+    return parse_number_slow_path<T, UC>(first, last, value, options, bjf);
+  }
+  to_float(pns.negative, am, value);
+  // Test for over/underflow.
+  if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) ||
+      am.power2 == binary_format<T>::infinite_power()) {
+    answer.ec = std::errc::result_out_of_range;
+  }
+  return answer;
 }

 template <typename T, typename UC, typename>