mirror of
https://github.com/fastfloat/fast_float.git
synced 2025-12-06 16:56:57 +08:00
Fixes and cleanup for the parse_number_string function.
The exponent value always fits in an int16_t. Original main: tests time is 44278 ms; size of my tests 389.0k; size of my program 164.0k. My main: tests time is 42015 ms; size of my tests 389.0k; size of my program 164.0k. My main with FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN: tests time is 41282 ms; size of my tests 386.5k; size of my program 161.5k. After this I'll try it on my partner's Linux machine with the original test suite for a better comparison.
This commit is contained in:
parent
2da25b51c8
commit
8e1fda5d08
@ -50,7 +50,7 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
|
||||
read8_to_u64(UC const *chars) {
|
||||
if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
|
||||
uint64_t val = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
for (uint8_t i = 0; i != 8; ++i) {
|
||||
val |= uint64_t(uint8_t(*chars)) << (i * 8);
|
||||
++chars;
|
||||
}
|
||||
@ -261,7 +261,7 @@ enum class parse_error {
|
||||
|
||||
template <typename UC> struct parsed_number_string_t {
|
||||
uint64_t mantissa{0};
|
||||
int32_t exponent{0};
|
||||
int16_t exponent{0};
|
||||
#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
|
||||
bool negative{false};
|
||||
#endif
|
||||
@ -327,18 +327,17 @@ parse_number_string(UC const *p, UC const *pend,
|
||||
|
||||
UC const *const start_digits = p;
|
||||
|
||||
uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
|
||||
|
||||
// an unsigned int avoids signed overflows (which are bad)
|
||||
while ((p != pend) && is_integer(*p)) {
|
||||
// a multiplication by 10 is cheaper than an arbitrary integer
|
||||
// multiplication
|
||||
i = 10 * i +
|
||||
answer.mantissa = 10 * answer.mantissa +
|
||||
uint64_t(*p -
|
||||
UC('0')); // might overflow, we will handle the overflow later
|
||||
++p;
|
||||
}
|
||||
UC const *const end_of_integer_part = p;
|
||||
uint32_t digit_count = uint32_t(end_of_integer_part - start_digits);
|
||||
uint16_t digit_count = uint16_t(end_of_integer_part - start_digits);
|
||||
answer.integer = span<UC const>(start_digits, digit_count);
|
||||
#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
|
||||
FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
|
||||
@ -353,43 +352,46 @@ parse_number_string(UC const *p, UC const *pend,
|
||||
}
|
||||
#endif
|
||||
|
||||
int32_t exponent = 0;
|
||||
bool const has_decimal_point = (p != pend) && (*p == options.decimal_point);
|
||||
if (has_decimal_point) {
|
||||
++p;
|
||||
UC const *before = p;
|
||||
uint16_t fraction = 0;
|
||||
// can occur at most twice without overflowing, but let it occur more, since
|
||||
// for integers with many digits, digit parsing is the primary bottleneck.
|
||||
loop_parse_if_eight_digits(p, pend, i);
|
||||
loop_parse_if_eight_digits(p, pend, answer.mantissa);
|
||||
|
||||
while ((p != pend) && is_integer(*p)) {
|
||||
uint8_t const digit = uint8_t(*p - UC('0'));
|
||||
i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
|
||||
answer.mantissa = answer.mantissa * 10 + digit; // in rare cases, this will overflow, but that's ok
|
||||
++p;
|
||||
}
|
||||
exponent = int32_t(before - p);
|
||||
answer.fraction = span<UC const>(before, uint32_t(p - before));
|
||||
digit_count -= exponent;
|
||||
}
|
||||
fraction = uint16_t(before - p);
|
||||
answer.fraction = span<UC const>(before, uint16_t(p - before));
|
||||
digit_count -= fraction;
|
||||
#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
|
||||
FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
|
||||
// at least 1 digit in fractional part
|
||||
if (has_decimal_point && exponent == 0) {
|
||||
if (has_decimal_point && fraction == 0) {
|
||||
return report_parse_error<UC>(p,
|
||||
parse_error::no_digits_in_fractional_part);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else if (digit_count == 0) { // we must have encountered at least one integer!
|
||||
return report_parse_error<UC>(p, parse_error::no_digits_in_mantissa);
|
||||
}
|
||||
int32_t exp_number = 0; // explicit exponential part
|
||||
// We have now parsed the integer and the fraction part of the mantissa.
|
||||
|
||||
// Now we can parse the exponent part.
|
||||
if (p != pend &&
|
||||
(uint8_t(options.format & chars_format::scientific) &&
|
||||
((UC('e') == *p) || (UC('E') == *p)))
|
||||
(UC('e') == *p) || (UC('E') == *p))
|
||||
#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
|
||||
|| (uint8_t(options.format & detail::basic_fortran_fmt) &&
|
||||
(UC('d') == *p) || (UC('D') == *p))
|
||||
((UC('+') == *p) || (UC('-') == *p) ||
|
||||
(UC('d') == *p) || (UC('D') == *p)))
|
||||
#endif
|
||||
) {
|
||||
UC const *location_of_e = p;
|
||||
@ -416,14 +418,16 @@ parse_number_string(UC const *p, UC const *pend,
|
||||
p = location_of_e;
|
||||
} else {
|
||||
while ((p != pend) && is_integer(*p)) {
|
||||
if (answer.exponent < 0x1000) {
|
||||
// check for exponent overflow if we have too many digits.
|
||||
uint8_t const digit = uint8_t(*p - UC('0'));
|
||||
exp_number = 10 * exp_number + digit;
|
||||
answer.exponent = 10 * answer.exponent + digit;
|
||||
}
|
||||
++p;
|
||||
}
|
||||
if (neg_exp) {
|
||||
exp_number = -exp_number;
|
||||
answer.exponent = -answer.exponent;
|
||||
}
|
||||
exponent += exp_number;
|
||||
}
|
||||
} else {
|
||||
// If it scientific and not fixed, we have to bail out.
|
||||
@ -459,30 +463,28 @@ parse_number_string(UC const *p, UC const *pend,
|
||||
// Let us start again, this time, avoiding overflows.
|
||||
// We don't need to check if is_integer, since we use the
|
||||
// pre-tokenized spans from above.
|
||||
i = 0;
|
||||
answer.mantissa = 0;
|
||||
p = answer.integer.ptr;
|
||||
UC const *int_end = p + answer.integer.len();
|
||||
uint64_t const minimal_nineteen_digit_integer{1000000000000000000};
|
||||
while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
|
||||
i = i * 10 + uint64_t(*p - UC('0'));
|
||||
while ((answer.mantissa < minimal_nineteen_digit_integer) && (p != int_end)) {
|
||||
answer.mantissa = answer.mantissa * 10 + uint64_t(*p - UC('0'));
|
||||
++p;
|
||||
}
|
||||
if (i >= minimal_nineteen_digit_integer) { // We have a big integers
|
||||
exponent = uint32_t(end_of_integer_part - p) + exp_number;
|
||||
if (answer.mantissa >= minimal_nineteen_digit_integer) { // We have a big integers
|
||||
answer.exponent += int16_t(end_of_integer_part - p);
|
||||
} else { // We have a value with a fractional component.
|
||||
p = answer.fraction.ptr;
|
||||
UC const *frac_end = p + answer.fraction.len();
|
||||
while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
|
||||
i = i * 10 + uint64_t(*p - UC('0'));
|
||||
while ((answer.mantissa < minimal_nineteen_digit_integer) && (p != frac_end)) {
|
||||
answer.mantissa = answer.mantissa * 10 + uint64_t(*p - UC('0'));
|
||||
++p;
|
||||
}
|
||||
exponent = uint32_t(answer.fraction.ptr - p) + exp_number;
|
||||
answer.exponent += int16_t(answer.fraction.ptr - p);
|
||||
}
|
||||
// We have now corrected both exponent and i, to a truncated value
|
||||
// We have now corrected both exponent and mantissa, to a truncated value
|
||||
}
|
||||
}
|
||||
answer.exponent = exponent;
|
||||
answer.mantissa = i;
|
||||
return answer;
|
||||
}
|
||||
|
||||
@ -518,7 +520,6 @@ parse_int_string(UC const *p, UC const *pend, T &value,
|
||||
|
||||
UC const *const start_num = p;
|
||||
|
||||
// use SIMD here?
|
||||
while (p != pend && *p == UC('0')) {
|
||||
++p;
|
||||
}
|
||||
@ -541,7 +542,7 @@ parse_int_string(UC const *p, UC const *pend, T &value,
|
||||
p++;
|
||||
}
|
||||
|
||||
uint32_t const digit_count = uint32_t(p - start_digits);
|
||||
uint16_t const digit_count = uint16_t(p - start_digits);
|
||||
|
||||
if (digit_count == 0) {
|
||||
if (has_leading_zeros) {
|
||||
|
||||
@ -19,11 +19,11 @@ namespace fast_float {
|
||||
#if defined(FASTFLOAT_64BIT) && !defined(__sparc)
|
||||
#define FASTFLOAT_64BIT_LIMB 1
|
||||
typedef uint64_t limb;
|
||||
constexpr uint32_t limb_bits = 64;
|
||||
constexpr uint16_t limb_bits = 64;
|
||||
#else
|
||||
#define FASTFLOAT_32BIT_LIMB
|
||||
typedef uint32_t limb;
|
||||
constexpr uint32_t limb_bits = 32;
|
||||
constexpr uint16_t limb_bits = 32;
|
||||
#endif
|
||||
|
||||
typedef span<limb> limb_span;
|
||||
@ -32,15 +32,15 @@ typedef span<limb> limb_span;
|
||||
// of bits required to store the largest bigint, which is
|
||||
// `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or
|
||||
// ~3600 bits, so we round to 4000.
|
||||
constexpr uint32_t bigint_bits = 4000;
|
||||
constexpr uint32_t bigint_limbs = bigint_bits / limb_bits;
|
||||
constexpr uint16_t bigint_bits = 4000;
|
||||
constexpr uint16_t bigint_limbs = bigint_bits / limb_bits;
|
||||
|
||||
// vector-like type that is allocated on the stack. the entire
|
||||
// buffer is pre-allocated, and only the length changes.
|
||||
template <uint32_t size> struct stackvec {
|
||||
template <uint8_t size> struct stackvec {
|
||||
limb data[size];
|
||||
// we never need more than 150 limbs
|
||||
uint32_t length{0};
|
||||
uint8_t length{0};
|
||||
|
||||
FASTFLOAT_CONSTEXPR20 stackvec() noexcept = default;
|
||||
stackvec(stackvec const &) = delete;
|
||||
@ -53,33 +53,33 @@ template <uint32_t size> struct stackvec {
|
||||
FASTFLOAT_ASSERT(try_extend(s));
|
||||
}
|
||||
|
||||
FASTFLOAT_CONSTEXPR14 limb &operator[](uint32_t index) noexcept {
|
||||
FASTFLOAT_CONSTEXPR14 limb &operator[](uint16_t index) noexcept {
|
||||
FASTFLOAT_DEBUG_ASSERT(index < length);
|
||||
return data[index];
|
||||
}
|
||||
|
||||
FASTFLOAT_CONSTEXPR14 const limb &operator[](uint32_t index) const noexcept {
|
||||
FASTFLOAT_CONSTEXPR14 const limb &operator[](uint16_t index) const noexcept {
|
||||
FASTFLOAT_DEBUG_ASSERT(index < length);
|
||||
return data[index];
|
||||
}
|
||||
|
||||
// index from the end of the container
|
||||
FASTFLOAT_CONSTEXPR14 const limb &rindex(uint32_t index) const noexcept {
|
||||
FASTFLOAT_CONSTEXPR14 const limb &rindex(uint16_t index) const noexcept {
|
||||
FASTFLOAT_DEBUG_ASSERT(index < length);
|
||||
uint32_t rindex = length - index - 1;
|
||||
uint16_t rindex = length - index - 1;
|
||||
return data[rindex];
|
||||
}
|
||||
|
||||
// set the length, without bounds checking.
|
||||
FASTFLOAT_CONSTEXPR14 void set_len(uint32_t len) noexcept {
|
||||
FASTFLOAT_CONSTEXPR14 void set_len(uint8_t len) noexcept {
|
||||
length = len;
|
||||
}
|
||||
|
||||
constexpr uint32_t len() const noexcept { return length; }
|
||||
constexpr uint8_t len() const noexcept { return length; }
|
||||
|
||||
constexpr bool is_empty() const noexcept { return length == 0; }
|
||||
|
||||
constexpr uint32_t capacity() const noexcept { return size; }
|
||||
constexpr uint8_t capacity() const noexcept { return size; }
|
||||
|
||||
// append item to vector, without bounds checking
|
||||
FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept {
|
||||
@ -118,9 +118,9 @@ template <uint32_t size> struct stackvec {
|
||||
// if the new size is longer than the vector, assign value to each
|
||||
// appended item.
|
||||
FASTFLOAT_CONSTEXPR20
|
||||
void resize_unchecked(uint32_t new_len, limb value) noexcept {
|
||||
void resize_unchecked(uint8_t new_len, limb value) noexcept {
|
||||
if (new_len > len()) {
|
||||
uint32_t count = new_len - len();
|
||||
uint8_t count = new_len - len();
|
||||
limb *first = data + len();
|
||||
limb *last = first + count;
|
||||
::std::fill(first, last, value);
|
||||
@ -131,7 +131,7 @@ template <uint32_t size> struct stackvec {
|
||||
}
|
||||
|
||||
// try to resize the vector, returning if the vector was resized.
|
||||
FASTFLOAT_CONSTEXPR20 bool try_resize(uint32_t new_len, limb value) noexcept {
|
||||
FASTFLOAT_CONSTEXPR20 bool try_resize(uint8_t new_len, limb value) noexcept {
|
||||
if (new_len > capacity()) {
|
||||
return false;
|
||||
} else {
|
||||
@ -143,7 +143,7 @@ template <uint32_t size> struct stackvec {
|
||||
// check if any limbs are non-zero after the given index.
|
||||
// this needs to be done in reverse order, since the index
|
||||
// is relative to the most significant limbs.
|
||||
FASTFLOAT_CONSTEXPR14 bool nonzero(uint32_t index) const noexcept {
|
||||
FASTFLOAT_CONSTEXPR14 bool nonzero(uint16_t index) const noexcept {
|
||||
while (index < len()) {
|
||||
if (rindex(index) != 0) {
|
||||
return true;
|
||||
@ -258,10 +258,10 @@ scalar_mul(limb x, limb y, limb &carry) noexcept {
|
||||
|
||||
// add scalar value to bigint starting from offset.
|
||||
// used in grade school multiplication
|
||||
template <uint32_t size>
|
||||
template <uint8_t size>
|
||||
inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec<size> &vec, limb y,
|
||||
uint32_t start) noexcept {
|
||||
uint32_t index = start;
|
||||
uint8_t index = (uint8_t)start;
|
||||
limb carry = y;
|
||||
bool overflow;
|
||||
while (carry != 0 && index < vec.len()) {
|
||||
@ -276,18 +276,18 @@ inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec<size> &vec, limb y,
|
||||
}
|
||||
|
||||
// add scalar value to bigint.
|
||||
template <uint32_t size>
|
||||
template <uint8_t size>
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
|
||||
small_add(stackvec<size> &vec, limb y) noexcept {
|
||||
return small_add_from(vec, y, 0);
|
||||
}
|
||||
|
||||
// multiply bigint by scalar value.
|
||||
template <uint32_t size>
|
||||
template <uint8_t size>
|
||||
inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec<size> &vec,
|
||||
limb y) noexcept {
|
||||
limb carry = 0;
|
||||
for (uint32_t index = 0; index != vec.len(); ++index) {
|
||||
for (uint8_t index = 0; index != vec.len(); ++index) {
|
||||
vec[index] = scalar_mul(vec[index], y, carry);
|
||||
}
|
||||
if (carry != 0) {
|
||||
@ -298,9 +298,9 @@ inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec<size> &vec,
|
||||
|
||||
// add bigint to bigint starting from index.
|
||||
// used in grade school multiplication
|
||||
template <uint32_t size>
|
||||
template <uint8_t size>
|
||||
FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
|
||||
uint32_t start) noexcept {
|
||||
uint8_t start) noexcept {
|
||||
// the effective x buffer is from `xstart..x.len()`, so exit early
|
||||
// if we can't get that current range.
|
||||
if (x.len() < start || y.len() > x.len() - start) {
|
||||
@ -308,7 +308,7 @@ FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
|
||||
}
|
||||
|
||||
bool carry = false;
|
||||
for (uint32_t index = 0; index < y.len(); ++index) {
|
||||
for (uint8_t index = 0; index < y.len(); ++index) {
|
||||
limb xi = x[index + start];
|
||||
limb yi = y[index];
|
||||
bool c1 = false;
|
||||
@ -329,14 +329,14 @@ FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
|
||||
}
|
||||
|
||||
// add bigint to bigint.
|
||||
template <uint32_t size>
|
||||
template <uint8_t size>
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
|
||||
large_add_from(stackvec<size> &x, limb_span y) noexcept {
|
||||
return large_add_from(x, y, 0);
|
||||
}
|
||||
|
||||
// grade-school multiplication algorithm
|
||||
template <uint32_t size>
|
||||
template <uint8_t size>
|
||||
FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
|
||||
limb_span xs = limb_span(x.data, x.len());
|
||||
stackvec<size> z(xs);
|
||||
@ -345,7 +345,7 @@ FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
|
||||
if (y.len() != 0) {
|
||||
limb y0 = y[0];
|
||||
FASTFLOAT_TRY(small_mul(x, y0));
|
||||
for (uint32_t index = 1; index != y.len(); ++index) {
|
||||
for (uint8_t index = 1; index != y.len(); ++index) {
|
||||
limb yi = y[index];
|
||||
stackvec<size> zi;
|
||||
if (yi != 0) {
|
||||
@ -364,7 +364,7 @@ FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
|
||||
}
|
||||
|
||||
// grade-school multiplication algorithm
|
||||
template <uint32_t size>
|
||||
template <uint8_t size>
|
||||
FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec<size> &x, limb_span y) noexcept {
|
||||
if (y.len() == 1) {
|
||||
FASTFLOAT_TRY(small_mul(x, y[0]));
|
||||
@ -493,7 +493,7 @@ struct bigint : pow5_tables<> {
|
||||
} else if (vec.len() < other.vec.len()) {
|
||||
return -1;
|
||||
} else {
|
||||
for (uint32_t index = vec.len(); index > 0; --index) {
|
||||
for (uint8_t index = vec.len(); index > 0; --index) {
|
||||
limb xi = vec[index - 1];
|
||||
limb yi = other.vec[index - 1];
|
||||
if (xi > yi) {
|
||||
@ -508,7 +508,7 @@ struct bigint : pow5_tables<> {
|
||||
|
||||
// shift left each limb n bits, carrying over to the new limb
|
||||
// returns true if we were able to shift all the digits.
|
||||
FASTFLOAT_CONSTEXPR20 bool shl_bits(uint32_t n) noexcept {
|
||||
FASTFLOAT_CONSTEXPR20 bool shl_bits(uint16_t n) noexcept {
|
||||
// Internally, for each item, we shift left by n, and add the previous
|
||||
// right shifted limb-bits.
|
||||
// For example, we transform (for u8) shifted left 2, to:
|
||||
@ -517,10 +517,10 @@ struct bigint : pow5_tables<> {
|
||||
FASTFLOAT_DEBUG_ASSERT(n != 0);
|
||||
FASTFLOAT_DEBUG_ASSERT(n < sizeof(limb) * 8);
|
||||
|
||||
uint32_t const shl = n;
|
||||
uint32_t const shr = limb_bits - shl;
|
||||
uint16_t const shl = n;
|
||||
uint16_t const shr = limb_bits - shl;
|
||||
limb prev = 0;
|
||||
for (uint32_t index = 0; index != vec.len(); ++index) {
|
||||
for (uint8_t index = 0; index != vec.len(); ++index) {
|
||||
limb xi = vec[index];
|
||||
vec[index] = (xi << shl) | (prev >> shr);
|
||||
prev = xi;
|
||||
@ -534,7 +534,7 @@ struct bigint : pow5_tables<> {
|
||||
}
|
||||
|
||||
// move the limbs left by `n` limbs.
|
||||
FASTFLOAT_CONSTEXPR20 bool shl_limbs(uint32_t n) noexcept {
|
||||
FASTFLOAT_CONSTEXPR20 bool shl_limbs(int16_t n) noexcept {
|
||||
FASTFLOAT_DEBUG_ASSERT(n != 0);
|
||||
if (n + vec.len() > vec.capacity()) {
|
||||
return false;
|
||||
@ -555,9 +555,9 @@ struct bigint : pow5_tables<> {
|
||||
}
|
||||
|
||||
// move the limbs left by `n` bits.
|
||||
FASTFLOAT_CONSTEXPR20 bool shl(uint32_t n) noexcept {
|
||||
uint32_t const rem = n % limb_bits;
|
||||
uint32_t const div = n / limb_bits;
|
||||
FASTFLOAT_CONSTEXPR20 bool shl(uint16_t n) noexcept {
|
||||
uint16_t const rem = n % limb_bits;
|
||||
uint16_t const div = n / limb_bits;
|
||||
if (rem != 0) {
|
||||
FASTFLOAT_TRY(shl_bits(rem));
|
||||
}
|
||||
@ -568,7 +568,7 @@ struct bigint : pow5_tables<> {
|
||||
}
|
||||
|
||||
// get the number of leading zeros in the bigint.
|
||||
FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept {
|
||||
FASTFLOAT_CONSTEXPR20 uint8_t ctlz() const noexcept {
|
||||
if (vec.is_empty()) {
|
||||
return 0;
|
||||
} else {
|
||||
@ -583,9 +583,9 @@ struct bigint : pow5_tables<> {
|
||||
}
|
||||
|
||||
// get the number of bits in the bigint.
|
||||
FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept {
|
||||
int lz = ctlz();
|
||||
return int(limb_bits * vec.len()) - lz;
|
||||
FASTFLOAT_CONSTEXPR20 uint16_t bit_length() const noexcept {
|
||||
uint16_t lz = ctlz();
|
||||
return uint16_t(limb_bits * vec.len()) - lz;
|
||||
}
|
||||
|
||||
FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); }
|
||||
@ -593,22 +593,22 @@ struct bigint : pow5_tables<> {
|
||||
FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); }
|
||||
|
||||
// multiply as if by 2 raised to a power.
|
||||
FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); }
|
||||
FASTFLOAT_CONSTEXPR20 bool pow2(int16_t exp) noexcept { return shl(exp); }
|
||||
|
||||
// multiply as if by 5 raised to a power.
|
||||
FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
|
||||
FASTFLOAT_CONSTEXPR20 bool pow5(int16_t exp) noexcept {
|
||||
// multiply by a power of 5
|
||||
size_t const large_length = sizeof(large_power_of_5) / sizeof(limb);
|
||||
uint8_t const large_length = sizeof(large_power_of_5) / sizeof(limb);
|
||||
limb_span const large = limb_span(large_power_of_5, large_length);
|
||||
while (exp >= large_step) {
|
||||
FASTFLOAT_TRY(large_mul(vec, large));
|
||||
exp -= large_step;
|
||||
}
|
||||
#ifdef FASTFLOAT_64BIT_LIMB
|
||||
uint32_t const small_step = 27;
|
||||
uint8_t const small_step = 27;
|
||||
limb const max_native = 7450580596923828125UL;
|
||||
#else
|
||||
uint32_t const small_step = 13;
|
||||
uint8_t const small_step = 13;
|
||||
limb const max_native = 1220703125U;
|
||||
#endif
|
||||
while (exp >= small_step) {
|
||||
@ -627,7 +627,7 @@ struct bigint : pow5_tables<> {
|
||||
}
|
||||
|
||||
// multiply as if by 10 raised to a power.
|
||||
FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept {
|
||||
FASTFLOAT_CONSTEXPR20 bool pow10(int16_t exp) noexcept {
|
||||
FASTFLOAT_TRY(pow5(exp));
|
||||
return pow2(exp);
|
||||
}
|
||||
|
||||
@ -71,12 +71,12 @@ constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
|
||||
// for significant digits already multiplied by 10 ** q.
|
||||
template <typename binary>
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa
|
||||
compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
|
||||
int hilz = int(w >> 63) ^ 1;
|
||||
compute_error_scaled(int64_t q, uint64_t w, int32_t lz) noexcept {
|
||||
int32_t hilz = int32_t(w >> 63) ^ 1;
|
||||
adjusted_mantissa answer;
|
||||
answer.mantissa = w << hilz;
|
||||
int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent();
|
||||
answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 +
|
||||
int32_t bias = binary::mantissa_explicit_bits() - binary::minimum_exponent();
|
||||
answer.power2 = int16_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 +
|
||||
invalid_am_bias);
|
||||
return answer;
|
||||
}
|
||||
@ -143,7 +143,7 @@ compute_float(int64_t q, uint64_t w) noexcept {
|
||||
|
||||
answer.mantissa = product.high >> shift;
|
||||
|
||||
answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz -
|
||||
answer.power2 = int16_t(detail::power(int32_t(q)) + upperbit - lz -
|
||||
binary::minimum_exponent());
|
||||
if (answer.power2 <= 0) { // we have a subnormal?
|
||||
// Here have that answer.power2 <= 0 so -answer.power2 >= 0
|
||||
@ -196,7 +196,7 @@ compute_float(int64_t q, uint64_t w) noexcept {
|
||||
answer.mantissa >>= 1;
|
||||
if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) {
|
||||
answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits());
|
||||
answer.power2++; // undo previous addition
|
||||
++answer.power2; // undo previous addition
|
||||
}
|
||||
|
||||
answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits());
|
||||
|
||||
@ -39,10 +39,10 @@ constexpr static uint64_t powers_of_ten_uint64[] = {1UL,
|
||||
// effect on performance: in order to have a faster algorithm, we'd need
|
||||
// to slow down performance for faster algorithms, and this is still fast.
|
||||
template <typename UC>
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int16_t
|
||||
scientific_exponent(parsed_number_string_t<UC> const &num) noexcept {
|
||||
uint64_t mantissa = num.mantissa;
|
||||
int32_t exponent = num.exponent;
|
||||
int16_t exponent = num.exponent;
|
||||
while (mantissa >= 10000) {
|
||||
mantissa /= 10000;
|
||||
exponent += 4;
|
||||
@ -68,7 +68,7 @@ to_extended(T const &value) noexcept {
|
||||
constexpr equiv_uint hidden_bit_mask = binary_format<T>::hidden_bit_mask();
|
||||
|
||||
adjusted_mantissa am;
|
||||
int32_t bias = binary_format<T>::mantissa_explicit_bits() -
|
||||
int16_t bias = binary_format<T>::mantissa_explicit_bits() -
|
||||
binary_format<T>::minimum_exponent();
|
||||
equiv_uint bits;
|
||||
#if FASTFLOAT_HAS_BIT_CAST
|
||||
@ -82,7 +82,7 @@ to_extended(T const &value) noexcept {
|
||||
am.mantissa = bits & mantissa_mask;
|
||||
} else {
|
||||
// normal
|
||||
am.power2 = int32_t((bits & exponent_mask) >>
|
||||
am.power2 = int16_t((bits & exponent_mask) >>
|
||||
binary_format<T>::mantissa_explicit_bits());
|
||||
am.power2 -= bias;
|
||||
am.mantissa = (bits & mantissa_mask) | hidden_bit_mask;
|
||||
@ -108,11 +108,11 @@ to_extended_halfway(T const &value) noexcept {
|
||||
template <typename T, typename callback>
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am,
|
||||
callback cb) noexcept {
|
||||
int32_t mantissa_shift = 64 - binary_format<T>::mantissa_explicit_bits() - 1;
|
||||
int16_t mantissa_shift = 64 - binary_format<T>::mantissa_explicit_bits() - 1;
|
||||
if (-am.power2 >= mantissa_shift) {
|
||||
// have a denormal float
|
||||
int32_t shift = -am.power2 + 1;
|
||||
cb(am, std::min<int32_t>(shift, 64));
|
||||
int16_t shift = -am.power2 + 1;
|
||||
cb(am, std::min<int16_t>(shift, 64));
|
||||
// check for round-up: if rounding-nearest carried us to the hidden bit.
|
||||
am.power2 = (am.mantissa <
|
||||
(uint64_t(1) << binary_format<T>::mantissa_explicit_bits()))
|
||||
@ -128,7 +128,7 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am,
|
||||
if (am.mantissa >=
|
||||
(uint64_t(2) << binary_format<T>::mantissa_explicit_bits())) {
|
||||
am.mantissa = (uint64_t(1) << binary_format<T>::mantissa_explicit_bits());
|
||||
am.power2++;
|
||||
++am.power2;
|
||||
}
|
||||
|
||||
// check for infinite: we could have carried to an infinite power
|
||||
@ -141,7 +141,7 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am,
|
||||
|
||||
template <typename callback>
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void
|
||||
round_nearest_tie_even(adjusted_mantissa &am, int32_t shift,
|
||||
round_nearest_tie_even(adjusted_mantissa &am, int16_t shift,
|
||||
callback cb) noexcept {
|
||||
uint64_t const mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1;
|
||||
uint64_t const halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1);
|
||||
@ -162,7 +162,7 @@ round_nearest_tie_even(adjusted_mantissa &am, int32_t shift,
|
||||
}
|
||||
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void
|
||||
round_down(adjusted_mantissa &am, int32_t shift) noexcept {
|
||||
round_down(adjusted_mantissa &am, int16_t shift) noexcept {
|
||||
if (shift == 64) {
|
||||
am.mantissa = 0;
|
||||
} else {
|
||||
@ -342,17 +342,17 @@ parse_mantissa(bigint &result, const parsed_number_string_t<UC> &num) noexcept {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline FASTFLOAT_CONSTEXPR20 void
|
||||
positive_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
int32_t const exponent) noexcept {
|
||||
FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent)));
|
||||
inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
|
||||
positive_digit_comp(bigint &bigmant, adjusted_mantissa am,
|
||||
int16_t const exponent) noexcept {
|
||||
FASTFLOAT_ASSERT(bigmant.pow10(exponent));
|
||||
bool truncated;
|
||||
am.mantissa = bigmant.hi64(truncated);
|
||||
int32_t bias = binary_format<T>::mantissa_explicit_bits() -
|
||||
int16_t bias = binary_format<T>::mantissa_explicit_bits() -
|
||||
binary_format<T>::minimum_exponent();
|
||||
am.power2 = bigmant.bit_length() - 64 + bias;
|
||||
|
||||
round<T>(am, [truncated](adjusted_mantissa &a, int32_t shift) {
|
||||
round<T>(am, [truncated](adjusted_mantissa &a, int16_t shift) {
|
||||
round_nearest_tie_even(
|
||||
a, shift,
|
||||
[truncated](bool is_odd, bool is_halfway, bool is_above) -> bool {
|
||||
@ -360,6 +360,8 @@ positive_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
(is_odd && is_halfway);
|
||||
});
|
||||
});
|
||||
|
||||
return am;
|
||||
}
|
||||
|
||||
// the scaling here is quite simple: we have, for the real digits `m * 10^e`,
|
||||
@ -368,11 +370,11 @@ positive_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
// we then need to scale by `2^(f- e)`, and then the two significant digits
|
||||
// are of the same magnitude.
|
||||
template <typename T>
|
||||
inline FASTFLOAT_CONSTEXPR20 void
|
||||
negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
int32_t const exponent) noexcept {
|
||||
inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
|
||||
negative_digit_comp(bigint &bigmant, adjusted_mantissa am,
|
||||
int16_t const exponent) noexcept {
|
||||
bigint &real_digits = bigmant;
|
||||
const int32_t &real_exp = exponent;
|
||||
int16_t const &real_exp = exponent;
|
||||
|
||||
T b;
|
||||
{
|
||||
@ -381,7 +383,7 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
// gcc7 bug: use a lambda to remove the noexcept qualifier bug with
|
||||
// -Wnoexcept-type.
|
||||
round<T>(am_b,
|
||||
[](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); });
|
||||
[](adjusted_mantissa &a, int16_t shift) { round_down(a, shift); });
|
||||
to_float(
|
||||
#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
|
||||
false,
|
||||
@ -390,23 +392,23 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
}
|
||||
adjusted_mantissa theor = to_extended_halfway(b);
|
||||
bigint theor_digits(theor.mantissa);
|
||||
int32_t theor_exp = theor.power2;
|
||||
int16_t theor_exp = theor.power2;
|
||||
|
||||
// scale real digits and theor digits to be same power.
|
||||
int32_t pow2_exp = theor_exp - real_exp;
|
||||
uint32_t pow5_exp = uint32_t(-real_exp);
|
||||
int16_t pow2_exp = theor_exp - real_exp;
|
||||
uint16_t pow5_exp = uint16_t(-real_exp);
|
||||
if (pow5_exp != 0) {
|
||||
FASTFLOAT_ASSERT(theor_digits.pow5(pow5_exp));
|
||||
}
|
||||
if (pow2_exp > 0) {
|
||||
FASTFLOAT_ASSERT(theor_digits.pow2(uint32_t(pow2_exp)));
|
||||
FASTFLOAT_ASSERT(theor_digits.pow2(pow2_exp));
|
||||
} else if (pow2_exp < 0) {
|
||||
FASTFLOAT_ASSERT(real_digits.pow2(uint32_t(-pow2_exp)));
|
||||
FASTFLOAT_ASSERT(real_digits.pow2(-pow2_exp));
|
||||
}
|
||||
|
||||
// compare digits, and use it to director rounding
|
||||
int ord = real_digits.compare(theor_digits);
|
||||
round<T>(am, [ord](adjusted_mantissa &a, int32_t shift) {
|
||||
round<T>(am, [ord](adjusted_mantissa &a, int16_t shift) {
|
||||
round_nearest_tie_even(
|
||||
a, shift, [ord](bool is_odd, bool _, bool __) -> bool {
|
||||
(void)_; // not needed, since we've done our comparison
|
||||
@ -420,6 +422,8 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
return am;
|
||||
}
|
||||
|
||||
// parse the significant digits as a big integer to unambiguously round the
|
||||
@ -436,21 +440,21 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
|
||||
// the actual digits. we then compare the big integer representations
|
||||
// of both, and use that to direct rounding.
|
||||
template <typename T, typename UC>
|
||||
inline FASTFLOAT_CONSTEXPR20 void digit_comp(
|
||||
parsed_number_string_t<UC> const &num, adjusted_mantissa &am) noexcept {
|
||||
inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa digit_comp(
|
||||
parsed_number_string_t<UC> const &num, adjusted_mantissa am) noexcept {
|
||||
// remove the invalid exponent bias
|
||||
am.power2 -= invalid_am_bias;
|
||||
|
||||
bigint bigmant;
|
||||
int32_t const sci_exp = scientific_exponent(num);
|
||||
int16_t const sci_exp = scientific_exponent(num);
|
||||
|
||||
uint16_t const digits = parse_mantissa<T, UC>(bigmant, num);
|
||||
// can't underflow, since digits is at most max_digits.
|
||||
int32_t const exponent = sci_exp + 1 - digits;
|
||||
int16_t const exponent = sci_exp + 1 - digits;
|
||||
if (exponent >= 0) {
|
||||
positive_digit_comp<T>(bigmant, am, exponent);
|
||||
return positive_digit_comp<T>(bigmant, am, exponent);
|
||||
} else {
|
||||
negative_digit_comp<T>(bigmant, am, exponent);
|
||||
return negative_digit_comp<T>(bigmant, am, exponent);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -293,15 +293,15 @@ fastfloat_strncasecmp(UC const *actual_mixedcase, UC const *expected_lowercase,
|
||||
// a pointer and a length to a contiguous block of memory
|
||||
template <typename T> struct span {
|
||||
T const *ptr;
|
||||
uint32_t length;
|
||||
uint16_t length;
|
||||
|
||||
constexpr span(T const *_ptr, uint32_t _length) : ptr(_ptr), length(_length) {}
|
||||
constexpr span(T const *_ptr, uint16_t _length) : ptr(_ptr), length(_length) {}
|
||||
|
||||
constexpr span() : ptr(nullptr), length(0) {}
|
||||
|
||||
constexpr uint32_t len() const noexcept { return length; }
|
||||
constexpr uint16_t len() const noexcept { return length; }
|
||||
|
||||
FASTFLOAT_CONSTEXPR14 const T &operator[](uint32_t index) const noexcept {
|
||||
FASTFLOAT_CONSTEXPR14 const T &operator[](uint16_t index) const noexcept {
|
||||
FASTFLOAT_DEBUG_ASSERT(index < length);
|
||||
return ptr[index];
|
||||
}
|
||||
@ -318,8 +318,8 @@ struct value128 {
|
||||
};
|
||||
|
||||
/* Helper C++14 constexpr generic implementation of leading_zeroes */
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int
|
||||
leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint8_t
|
||||
leading_zeroes_generic(uint64_t input_num, uint64_t last_bit = 0) {
|
||||
if (input_num & uint64_t(0xffffffff00000000)) {
|
||||
input_num >>= 32;
|
||||
last_bit |= 32;
|
||||
@ -343,11 +343,11 @@ leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
|
||||
if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */
|
||||
last_bit |= 1;
|
||||
}
|
||||
return 63 - last_bit;
|
||||
return 63 - (uint8_t)last_bit;
|
||||
}
|
||||
|
||||
/* result might be undefined when input_num is zero */
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int
|
||||
fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint8_t
|
||||
leading_zeroes(uint64_t input_num) noexcept {
|
||||
assert(input_num > 0);
|
||||
FASTFLOAT_ASSUME(input_num > 0);
|
||||
@ -360,12 +360,12 @@ leading_zeroes(uint64_t input_num) noexcept {
|
||||
// Search the mask data from most significant bit (MSB)
|
||||
// to least significant bit (LSB) for a set bit (1).
|
||||
_BitScanReverse64(&leading_zero, input_num);
|
||||
return (int)(63 - leading_zero);
|
||||
return (uint8_t)(63 - leading_zero);
|
||||
#else
|
||||
return leading_zeroes_generic(input_num);
|
||||
return (uint8_t)leading_zeroes_generic(input_num);
|
||||
#endif
|
||||
#else
|
||||
return __builtin_clzll(input_num);
|
||||
return (uint8_t)__builtin_clzll(input_num);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -429,7 +429,7 @@ full_multiplication(uint64_t a, uint64_t b) noexcept {
|
||||
|
||||
struct adjusted_mantissa {
|
||||
uint64_t mantissa;
|
||||
int32_t power2; // a negative value indicates an invalid result
|
||||
int16_t power2; // a negative value indicates an invalid result
|
||||
adjusted_mantissa() noexcept = default;
|
||||
|
||||
constexpr bool operator==(adjusted_mantissa const &o) const noexcept {
|
||||
|
||||
@ -283,7 +283,7 @@ from_chars_advanced(parsed_number_string_t<UC> const &pns, T &value) noexcept {
|
||||
// and we have an invalid power (am.power2 < 0), then we need to go the long
|
||||
// way around again. This is very uncommon.
|
||||
if (am.power2 < 0) {
|
||||
digit_comp<T>(pns, am);
|
||||
am = digit_comp<T>(pns, am);
|
||||
}
|
||||
to_float(
|
||||
#ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user