From 8e1fda5d08a3e0bce1bb0fe1ac1a4d14cb09342a Mon Sep 17 00:00:00 2001
From: IRainman <a.rainman@gmail.com>
Date: Thu, 10 Apr 2025 17:18:08 +0300
Subject: [PATCH] fixes and cleanup for the parse_number_string function.
 The exponent value always fits in an int16_t.

original main:
Tests:
 time is: 44278ms.

size of my tests 389.0k
size of my program 164.0k

my main:
Tests:
 time is: 42015ms.

size of my tests 389.0k
size of my program 164.0k

my main with FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN:
Tests:
 time is: 41282ms.

size of my tests 386.5k
size of my program 161.5k

After this, I'll run the original tests on my partner's Linux machine to get a more reliable comparison.
---
 include/fast_float/ascii_number.h      | 77 +++++++++++----------
 include/fast_float/bigint.h            | 96 +++++++++++++-------------
 include/fast_float/decimal_to_binary.h | 12 ++--
 include/fast_float/digit_comparison.h  | 70 ++++++++++---------
 include/fast_float/float_common.h      | 24 +++----
 include/fast_float/parse_number.h      |  2 +-
 6 files changed, 143 insertions(+), 138 deletions(-)
diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index a3dd7c4..ec2b6b8 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -50,7 +50,7 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
 read8_to_u64(UC const *chars) {
   if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
     uint64_t val = 0;
-    for (int i = 0; i < 8; ++i) {
+    for (uint8_t i = 0; i != 8; ++i) {
       val |= uint64_t(uint8_t(*chars)) << (i * 8);
       ++chars;
     }
@@ -261,7 +261,7 @@ enum class parse_error {
 
 template <typename UC> struct parsed_number_string_t {
   uint64_t mantissa{0};
-  int32_t exponent{0};
+  int16_t exponent{0};
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
   bool negative{false};
 #endif
@@ -327,18 +327,17 @@ parse_number_string(UC const *p, UC const *pend,
 
   UC const *const start_digits = p;
 
-  uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
-
+  // an unsigned int avoids signed overflows (which are bad)
   while ((p != pend) && is_integer(*p)) {
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
-    i = 10 * i +
+    answer.mantissa = 10 * answer.mantissa +
         uint64_t(*p -
                  UC('0')); // might overflow, we will handle the overflow later
     ++p;
   }
   UC const *const end_of_integer_part = p;
-  uint32_t digit_count = uint32_t(end_of_integer_part - start_digits);
+  uint16_t digit_count = uint16_t(end_of_integer_part - start_digits);
   answer.integer = span<UC const>(start_digits, digit_count);
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
   FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
@@ -353,43 +352,46 @@ parse_number_string(UC const *p, UC const *pend,
   }
 #endif
 
-  int32_t exponent = 0;
   bool const has_decimal_point = (p != pend) && (*p == options.decimal_point);
   if (has_decimal_point) {
     ++p;
     UC const *before = p;
+    uint16_t fraction = 0;
     // can occur at most twice without overflowing, but let it occur more, since
     // for integers with many digits, digit parsing is the primary bottleneck.
-    loop_parse_if_eight_digits(p, pend, i);
+    loop_parse_if_eight_digits(p, pend, answer.mantissa);
 
     while ((p != pend) && is_integer(*p)) {
       uint8_t const digit = uint8_t(*p - UC('0'));
-      i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
+      answer.mantissa = answer.mantissa * 10 + digit; // in rare cases, this will overflow, but that's ok
       ++p;
     }
-    exponent = int32_t(before - p);
-    answer.fraction = span<UC const>(before, uint32_t(p - before));
-    digit_count -= exponent;
-  }
+    fraction = uint16_t(before - p);
+    answer.fraction = span<UC const>(before, uint16_t(p - before));
+    digit_count -= fraction;
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
-  FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
-    // at least 1 digit in fractional part
-    if (has_decimal_point && exponent == 0) {
-      return report_parse_error<UC>(p,
+    FASTFLOAT_IF_CONSTEXPR17(basic_json_fmt) {
+      // at least 1 digit in fractional part
+      if (has_decimal_point && fraction == 0) {
+        return report_parse_error<UC>(p,
                                     parse_error::no_digits_in_fractional_part);
+      }
     }
-  }
 #endif
+  }
   else if (digit_count == 0) { // we must have encountered at least one integer!
     return report_parse_error<UC>(p, parse_error::no_digits_in_mantissa);
   }
-  int32_t exp_number = 0; // explicit exponential part
+  // We have now parsed the integer and the fraction part of the mantissa.
+  
+  // Now we can parse the exponent part.
   if (p != pend &&
       (uint8_t(options.format & chars_format::scientific) &&
-       ((UC('e') == *p) || (UC('E') == *p)))
+       (UC('e') == *p) || (UC('E') == *p))
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
    || (uint8_t(options.format & detail::basic_fortran_fmt) &&
-        (UC('d') == *p) || (UC('D') == *p))
+      ((UC('+') == *p) || (UC('-') == *p) ||
+       (UC('d') == *p) || (UC('D') == *p)))
 #endif
   ) {
     UC const *location_of_e = p;
@@ -416,14 +418,16 @@ parse_number_string(UC const *p, UC const *pend,
       p = location_of_e;
     } else {
       while ((p != pend) && is_integer(*p)) {
-        uint8_t const digit = uint8_t(*p - UC('0'));
-        exp_number = 10 * exp_number + digit;
+        if (answer.exponent < 0x1000) {
+          // check for exponent overflow if we have too many digits.
+          uint8_t const digit = uint8_t(*p - UC('0'));
+          answer.exponent = 10 * answer.exponent + digit;
+        }
         ++p;
       }
       if (neg_exp) {
-        exp_number = -exp_number;
+        answer.exponent = -answer.exponent;
       }
-      exponent += exp_number;
     }
   } else {
     // If it scientific and not fixed, we have to bail out.
@@ -459,30 +463,28 @@ parse_number_string(UC const *p, UC const *pend,
       // Let us start again, this time, avoiding overflows.
       // We don't need to check if is_integer, since we use the
       // pre-tokenized spans from above.
-      i = 0;
+      answer.mantissa = 0;
       p = answer.integer.ptr;
       UC const *int_end = p + answer.integer.len();
       uint64_t const minimal_nineteen_digit_integer{1000000000000000000};
-      while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
-        i = i * 10 + uint64_t(*p - UC('0'));
+      while ((answer.mantissa < minimal_nineteen_digit_integer) && (p != int_end)) {
+        answer.mantissa = answer.mantissa * 10 + uint64_t(*p - UC('0'));
         ++p;
       }
-      if (i >= minimal_nineteen_digit_integer) { // We have a big integers
-        exponent = uint32_t(end_of_integer_part - p) + exp_number;
+      if (answer.mantissa >= minimal_nineteen_digit_integer) { // We have a big integers
+        answer.exponent += int16_t(end_of_integer_part - p);
       } else { // We have a value with a fractional component.
         p = answer.fraction.ptr;
         UC const *frac_end = p + answer.fraction.len();
-        while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
-          i = i * 10 + uint64_t(*p - UC('0'));
+        while ((answer.mantissa < minimal_nineteen_digit_integer) && (p != frac_end)) {
+          answer.mantissa = answer.mantissa * 10 + uint64_t(*p - UC('0'));
           ++p;
         }
-        exponent = uint32_t(answer.fraction.ptr - p) + exp_number;
+        answer.exponent += int16_t(answer.fraction.ptr - p);
       }
-      // We have now corrected both exponent and i, to a truncated value
+      // We have now corrected both exponent and mantissa, to a truncated value
     }
   }
-  answer.exponent = exponent;
-  answer.mantissa = i;
   return answer;
 }
 
@@ -518,7 +520,6 @@ parse_int_string(UC const *p, UC const *pend, T &value,
 
   UC const *const start_num = p;
 
-  // use SIMD here?
   while (p != pend && *p == UC('0')) {
     ++p;
   }
@@ -541,7 +542,7 @@ parse_int_string(UC const *p, UC const *pend, T &value,
     p++;
   }
 
-  uint32_t const digit_count = uint32_t(p - start_digits);
+  uint16_t const digit_count = uint16_t(p - start_digits);
 
   if (digit_count == 0) {
     if (has_leading_zeros) {
diff --git a/include/fast_float/bigint.h b/include/fast_float/bigint.h
index 7a481b4..aa18c2f 100644
--- a/include/fast_float/bigint.h
+++ b/include/fast_float/bigint.h
@@ -19,11 +19,11 @@ namespace fast_float {
 #if defined(FASTFLOAT_64BIT) && !defined(__sparc)
 #define FASTFLOAT_64BIT_LIMB 1
 typedef uint64_t limb;
-constexpr uint32_t limb_bits = 64;
+constexpr uint16_t limb_bits = 64;
 #else
 #define FASTFLOAT_32BIT_LIMB
 typedef uint32_t limb;
-constexpr uint32_t limb_bits = 32;
+constexpr uint16_t limb_bits = 32;
 #endif
 
 typedef span<limb> limb_span;
@@ -32,15 +32,15 @@ typedef span<limb> limb_span;
 // of bits required to store the largest bigint, which is
 // `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or
 // ~3600 bits, so we round to 4000.
-constexpr uint32_t bigint_bits = 4000;
-constexpr uint32_t bigint_limbs = bigint_bits / limb_bits;
+constexpr uint16_t bigint_bits = 4000;
+constexpr uint16_t bigint_limbs = bigint_bits / limb_bits;
 
 // vector-like type that is allocated on the stack. the entire
 // buffer is pre-allocated, and only the length changes.
-template <uint32_t size> struct stackvec {
+template <uint8_t size> struct stackvec {
   limb data[size];
   // we never need more than 150 limbs
-  uint32_t length{0};
+  uint8_t length{0};
 
   FASTFLOAT_CONSTEXPR20 stackvec() noexcept = default;
   stackvec(stackvec const &) = delete;
@@ -53,33 +53,33 @@ template <uint32_t size> struct stackvec {
     FASTFLOAT_ASSERT(try_extend(s));
   }
 
-  FASTFLOAT_CONSTEXPR14 limb &operator[](uint32_t index) noexcept {
+  FASTFLOAT_CONSTEXPR14 limb &operator[](uint16_t index) noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return data[index];
   }
 
-  FASTFLOAT_CONSTEXPR14 const limb &operator[](uint32_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const limb &operator[](uint16_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return data[index];
   }
 
   // index from the end of the container
-  FASTFLOAT_CONSTEXPR14 const limb &rindex(uint32_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const limb &rindex(uint16_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
-    uint32_t rindex = length - index - 1;
+    uint16_t rindex = length - index - 1;
     return data[rindex];
   }
 
   // set the length, without bounds checking.
-  FASTFLOAT_CONSTEXPR14 void set_len(uint32_t len) noexcept {
+  FASTFLOAT_CONSTEXPR14 void set_len(uint8_t len) noexcept {
     length = len;
   }
 
-  constexpr uint32_t len() const noexcept { return length; }
+  constexpr uint8_t len() const noexcept { return length; }
 
   constexpr bool is_empty() const noexcept { return length == 0; }
 
-  constexpr uint32_t capacity() const noexcept { return size; }
+  constexpr uint8_t capacity() const noexcept { return size; }
 
   // append item to vector, without bounds checking
   FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept {
@@ -118,9 +118,9 @@ template <uint32_t size> struct stackvec {
   // if the new size is longer than the vector, assign value to each
   // appended item.
   FASTFLOAT_CONSTEXPR20
-  void resize_unchecked(uint32_t new_len, limb value) noexcept {
+  void resize_unchecked(uint8_t new_len, limb value) noexcept {
     if (new_len > len()) {
-      uint32_t count = new_len - len();
+      uint8_t count = new_len - len();
       limb *first = data + len();
       limb *last = first + count;
       ::std::fill(first, last, value);
@@ -131,7 +131,7 @@ template <uint32_t size> struct stackvec {
   }
 
   // try to resize the vector, returning if the vector was resized.
-  FASTFLOAT_CONSTEXPR20 bool try_resize(uint32_t new_len, limb value) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool try_resize(uint8_t new_len, limb value) noexcept {
     if (new_len > capacity()) {
       return false;
     } else {
@@ -143,7 +143,7 @@ template <uint32_t size> struct stackvec {
   // check if any limbs are non-zero after the given index.
   // this needs to be done in reverse order, since the index
   // is relative to the most significant limbs.
-  FASTFLOAT_CONSTEXPR14 bool nonzero(uint32_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 bool nonzero(uint16_t index) const noexcept {
     while (index < len()) {
       if (rindex(index) != 0) {
         return true;
@@ -258,10 +258,10 @@ scalar_mul(limb x, limb y, limb &carry) noexcept {
 
 // add scalar value to bigint starting from offset.
 // used in grade school multiplication
-template <uint32_t size>
+template <uint8_t size>
 inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec<size> &vec, limb y,
                                                  uint32_t start) noexcept {
-  uint32_t index = start;
+  uint8_t index = (uint8_t)start;
   limb carry = y;
   bool overflow;
   while (carry != 0 && index < vec.len()) {
@@ -276,18 +276,18 @@ inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec<size> &vec, limb y,
 }
 
 // add scalar value to bigint.
-template <uint32_t size>
+template <uint8_t size>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
 small_add(stackvec<size> &vec, limb y) noexcept {
   return small_add_from(vec, y, 0);
 }
 
 // multiply bigint by scalar value.
-template <uint32_t size>
+template <uint8_t size>
 inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec<size> &vec,
                                             limb y) noexcept {
   limb carry = 0;
-  for (uint32_t index = 0; index != vec.len(); ++index) {
+  for (uint8_t index = 0; index != vec.len(); ++index) {
     vec[index] = scalar_mul(vec[index], y, carry);
   }
   if (carry != 0) {
@@ -298,9 +298,9 @@ inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec<size> &vec,
 
 // add bigint to bigint starting from index.
 // used in grade school multiplication
-template <uint32_t size>
+template <uint8_t size>
 FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
-                                          uint32_t start) noexcept {
+                                          uint8_t start) noexcept {
   // the effective x buffer is from `xstart..x.len()`, so exit early
   // if we can't get that current range.
   if (x.len() < start || y.len() > x.len() - start) {
@@ -308,7 +308,7 @@ FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
   }
 
   bool carry = false;
-  for (uint32_t index = 0; index < y.len(); ++index) {
+  for (uint8_t index = 0; index < y.len(); ++index) {
     limb xi = x[index + start];
     limb yi = y[index];
     bool c1 = false;
@@ -329,14 +329,14 @@ FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
 }
 
 // add bigint to bigint.
-template <uint32_t size>
+template <uint8_t size>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
 large_add_from(stackvec<size> &x, limb_span y) noexcept {
   return large_add_from(x, y, 0);
 }
 
 // grade-school multiplication algorithm
-template <uint32_t size>
+template <uint8_t size>
 FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
   limb_span xs = limb_span(x.data, x.len());
   stackvec<size> z(xs);
@@ -345,7 +345,7 @@ FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
   if (y.len() != 0) {
     limb y0 = y[0];
     FASTFLOAT_TRY(small_mul(x, y0));
-    for (uint32_t index = 1; index != y.len(); ++index) {
+    for (uint8_t index = 1; index != y.len(); ++index) {
       limb yi = y[index];
       stackvec<size> zi;
       if (yi != 0) {
@@ -364,7 +364,7 @@ FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
 }
 
 // grade-school multiplication algorithm
-template <uint32_t size>
+template <uint8_t size>
 FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec<size> &x, limb_span y) noexcept {
   if (y.len() == 1) {
     FASTFLOAT_TRY(small_mul(x, y[0]));
@@ -493,7 +493,7 @@ struct bigint : pow5_tables<> {
     } else if (vec.len() < other.vec.len()) {
       return -1;
     } else {
-      for (uint32_t index = vec.len(); index > 0; --index) {
+      for (uint8_t index = vec.len(); index > 0; --index) {
         limb xi = vec[index - 1];
         limb yi = other.vec[index - 1];
         if (xi > yi) {
@@ -508,7 +508,7 @@ struct bigint : pow5_tables<> {
 
   // shift left each limb n bits, carrying over to the new limb
   // returns true if we were able to shift all the digits.
-  FASTFLOAT_CONSTEXPR20 bool shl_bits(uint32_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_bits(uint16_t n) noexcept {
     // Internally, for each item, we shift left by n, and add the previous
     // right shifted limb-bits.
     // For example, we transform (for u8) shifted left 2, to:
@@ -517,10 +517,10 @@ struct bigint : pow5_tables<> {
     FASTFLOAT_DEBUG_ASSERT(n != 0);
     FASTFLOAT_DEBUG_ASSERT(n < sizeof(limb) * 8);
 
-    uint32_t const shl = n;
-    uint32_t const shr = limb_bits - shl;
+    uint16_t const shl = n;
+    uint16_t const shr = limb_bits - shl;
     limb prev = 0;
-    for (uint32_t index = 0; index != vec.len(); ++index) {
+    for (uint8_t index = 0; index != vec.len(); ++index) {
       limb xi = vec[index];
       vec[index] = (xi << shl) | (prev >> shr);
       prev = xi;
@@ -534,7 +534,7 @@ struct bigint : pow5_tables<> {
   }
 
   // move the limbs left by `n` limbs.
-  FASTFLOAT_CONSTEXPR20 bool shl_limbs(uint32_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_limbs(int16_t n) noexcept {
     FASTFLOAT_DEBUG_ASSERT(n != 0);
     if (n + vec.len() > vec.capacity()) {
       return false;
@@ -555,9 +555,9 @@ struct bigint : pow5_tables<> {
   }
 
   // move the limbs left by `n` bits.
-  FASTFLOAT_CONSTEXPR20 bool shl(uint32_t n) noexcept {
-    uint32_t const rem = n % limb_bits;
-    uint32_t const div = n / limb_bits;
+  FASTFLOAT_CONSTEXPR20 bool shl(uint16_t n) noexcept {
+    uint16_t const rem = n % limb_bits;
+    uint16_t const div = n / limb_bits;
     if (rem != 0) {
       FASTFLOAT_TRY(shl_bits(rem));
     }
@@ -568,7 +568,7 @@ struct bigint : pow5_tables<> {
   }
 
   // get the number of leading zeros in the bigint.
-  FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept {
+  FASTFLOAT_CONSTEXPR20 uint8_t ctlz() const noexcept {
     if (vec.is_empty()) {
       return 0;
     } else {
@@ -583,9 +583,9 @@ struct bigint : pow5_tables<> {
   }
 
   // get the number of bits in the bigint.
-  FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept {
-    int lz = ctlz();
-    return int(limb_bits * vec.len()) - lz;
+  FASTFLOAT_CONSTEXPR20 uint16_t bit_length() const noexcept {
+    uint16_t lz = ctlz();
+    return uint16_t(limb_bits * vec.len()) - lz;
   }
 
   FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); }
@@ -593,22 +593,22 @@ struct bigint : pow5_tables<> {
   FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); }
 
   // multiply as if by 2 raised to a power.
-  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); }
+  FASTFLOAT_CONSTEXPR20 bool pow2(int16_t exp) noexcept { return shl(exp); }
 
   // multiply as if by 5 raised to a power.
-  FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow5(int16_t exp) noexcept {
     // multiply by a power of 5
-    size_t const large_length = sizeof(large_power_of_5) / sizeof(limb);
+    uint8_t const large_length = sizeof(large_power_of_5) / sizeof(limb);
     limb_span const large = limb_span(large_power_of_5, large_length);
     while (exp >= large_step) {
       FASTFLOAT_TRY(large_mul(vec, large));
       exp -= large_step;
     }
 #ifdef FASTFLOAT_64BIT_LIMB
-    uint32_t const small_step = 27;
+    uint8_t const small_step = 27;
     limb const max_native = 7450580596923828125UL;
 #else
-    uint32_t const small_step = 13;
+    uint8_t const small_step = 13;
     limb const max_native = 1220703125U;
 #endif
     while (exp >= small_step) {
@@ -627,7 +627,7 @@ struct bigint : pow5_tables<> {
   }
 
   // multiply as if by 10 raised to a power.
-  FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow10(int16_t exp) noexcept {
     FASTFLOAT_TRY(pow5(exp));
     return pow2(exp);
   }
diff --git a/include/fast_float/decimal_to_binary.h b/include/fast_float/decimal_to_binary.h
index 2255716..a334e18 100644
--- a/include/fast_float/decimal_to_binary.h
+++ b/include/fast_float/decimal_to_binary.h
@@ -71,12 +71,12 @@ constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
 // for significant digits already multiplied by 10 ** q.
 template <typename binary>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa
-compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
-  int hilz = int(w >> 63) ^ 1;
+compute_error_scaled(int64_t q, uint64_t w, int32_t lz) noexcept {
+  int32_t hilz = int32_t(w >> 63) ^ 1;
   adjusted_mantissa answer;
   answer.mantissa = w << hilz;
-  int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent();
-  answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 +
+  int32_t bias = binary::mantissa_explicit_bits() - binary::minimum_exponent();
+  answer.power2 = int16_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 +
                           invalid_am_bias);
   return answer;
 }
@@ -143,7 +143,7 @@ compute_float(int64_t q, uint64_t w) noexcept {
 
   answer.mantissa = product.high >> shift;
 
-  answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz -
+  answer.power2 = int16_t(detail::power(int32_t(q)) + upperbit - lz -
                           binary::minimum_exponent());
   if (answer.power2 <= 0) { // we have a subnormal?
     // Here have that answer.power2 <= 0 so -answer.power2 >= 0
@@ -196,7 +196,7 @@ compute_float(int64_t q, uint64_t w) noexcept {
   answer.mantissa >>= 1;
   if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) {
     answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits());
-    answer.power2++; // undo previous addition
+    ++answer.power2; // undo previous addition
   }
 
   answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits());
diff --git a/include/fast_float/digit_comparison.h b/include/fast_float/digit_comparison.h
index 4090a69..82a2953 100644
--- a/include/fast_float/digit_comparison.h
+++ b/include/fast_float/digit_comparison.h
@@ -39,10 +39,10 @@ constexpr static uint64_t powers_of_ten_uint64[] = {1UL,
 // effect on performance: in order to have a faster algorithm, we'd need
 // to slow down performance for faster algorithms, and this is still fast.
 template <typename UC>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int16_t
 scientific_exponent(parsed_number_string_t<UC> const &num) noexcept {
   uint64_t mantissa = num.mantissa;
-  int32_t exponent = num.exponent;
+  int16_t exponent = num.exponent;
   while (mantissa >= 10000) {
     mantissa /= 10000;
     exponent += 4;
@@ -68,7 +68,7 @@ to_extended(T const &value) noexcept {
   constexpr equiv_uint hidden_bit_mask = binary_format<T>::hidden_bit_mask();
 
   adjusted_mantissa am;
-  int32_t bias = binary_format<T>::mantissa_explicit_bits() -
+  int16_t bias = binary_format<T>::mantissa_explicit_bits() -
                  binary_format<T>::minimum_exponent();
   equiv_uint bits;
 #if FASTFLOAT_HAS_BIT_CAST
@@ -82,7 +82,7 @@ to_extended(T const &value) noexcept {
     am.mantissa = bits & mantissa_mask;
   } else {
     // normal
-    am.power2 = int32_t((bits & exponent_mask) >>
+    am.power2 = int16_t((bits & exponent_mask) >>
                         binary_format<T>::mantissa_explicit_bits());
     am.power2 -= bias;
     am.mantissa = (bits & mantissa_mask) | hidden_bit_mask;
@@ -108,11 +108,11 @@ to_extended_halfway(T const &value) noexcept {
 template <typename T, typename callback>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am,
                                                          callback cb) noexcept {
-  int32_t mantissa_shift = 64 - binary_format<T>::mantissa_explicit_bits() - 1;
+  int16_t mantissa_shift = 64 - binary_format<T>::mantissa_explicit_bits() - 1;
   if (-am.power2 >= mantissa_shift) {
     // have a denormal float
-    int32_t shift = -am.power2 + 1;
-    cb(am, std::min<int32_t>(shift, 64));
+    int16_t shift = -am.power2 + 1;
+    cb(am, std::min<int16_t>(shift, 64));
     // check for round-up: if rounding-nearest carried us to the hidden bit.
     am.power2 = (am.mantissa <
                  (uint64_t(1) << binary_format<T>::mantissa_explicit_bits()))
@@ -128,7 +128,7 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am,
   if (am.mantissa >=
       (uint64_t(2) << binary_format<T>::mantissa_explicit_bits())) {
     am.mantissa = (uint64_t(1) << binary_format<T>::mantissa_explicit_bits());
-    am.power2++;
+    ++am.power2;
   }
 
   // check for infinite: we could have carried to an infinite power
@@ -141,7 +141,7 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am,
 
 template <typename callback>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void
-round_nearest_tie_even(adjusted_mantissa &am, int32_t shift,
+round_nearest_tie_even(adjusted_mantissa &am, int16_t shift,
                        callback cb) noexcept {
   uint64_t const mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1;
   uint64_t const halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1);
@@ -162,7 +162,7 @@ round_nearest_tie_even(adjusted_mantissa &am, int32_t shift,
 }
 
 fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void
-round_down(adjusted_mantissa &am, int32_t shift) noexcept {
+round_down(adjusted_mantissa &am, int16_t shift) noexcept {
   if (shift == 64) {
     am.mantissa = 0;
   } else {
@@ -342,17 +342,17 @@ parse_mantissa(bigint &result, const parsed_number_string_t<UC> &num) noexcept {
 }
 
 template <typename T>
-inline FASTFLOAT_CONSTEXPR20 void
-positive_digit_comp(bigint &bigmant, adjusted_mantissa &am,
-                    int32_t const exponent) noexcept {
-  FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent)));
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+positive_digit_comp(bigint &bigmant, adjusted_mantissa am,
+                    int16_t const exponent) noexcept {
+  FASTFLOAT_ASSERT(bigmant.pow10(exponent));
   bool truncated;
   am.mantissa = bigmant.hi64(truncated);
-  int32_t bias = binary_format<T>::mantissa_explicit_bits() -
+  int16_t bias = binary_format<T>::mantissa_explicit_bits() -
              binary_format<T>::minimum_exponent();
   am.power2 = bigmant.bit_length() - 64 + bias;
 
-  round<T>(am, [truncated](adjusted_mantissa &a, int32_t shift) {
+  round<T>(am, [truncated](adjusted_mantissa &a, int16_t shift) {
     round_nearest_tie_even(
         a, shift,
         [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool {
@@ -360,6 +360,8 @@ positive_digit_comp(bigint &bigmant, adjusted_mantissa &am,
                  (is_odd && is_halfway);
         });
   });
+
+  return am;
 }
 
 // the scaling here is quite simple: we have, for the real digits `m * 10^e`,
@@ -368,11 +370,11 @@ positive_digit_comp(bigint &bigmant, adjusted_mantissa &am,
 // we then need to scale by `2^(f- e)`, and then the two significant digits
 // are of the same magnitude.
 template <typename T>
-inline FASTFLOAT_CONSTEXPR20 void
-negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
-                    int32_t const exponent) noexcept {
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+negative_digit_comp(bigint &bigmant, adjusted_mantissa am,
+                    int16_t const exponent) noexcept {
   bigint &real_digits = bigmant;
-  const int32_t &real_exp = exponent;
+  int16_t const &real_exp = exponent;
 
   T b;
   {
@@ -381,7 +383,7 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
     // gcc7 bug: use a lambda to remove the noexcept qualifier bug with
     // -Wnoexcept-type.
     round<T>(am_b,
-             [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); });
+             [](adjusted_mantissa &a, int16_t shift) { round_down(a, shift); });
     to_float(
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN
             false,
@@ -390,23 +392,23 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
   }
   adjusted_mantissa theor = to_extended_halfway(b);
   bigint theor_digits(theor.mantissa);
-  int32_t theor_exp = theor.power2;
+  int16_t theor_exp = theor.power2;
 
   // scale real digits and theor digits to be same power.
-  int32_t pow2_exp = theor_exp - real_exp;
-  uint32_t pow5_exp = uint32_t(-real_exp);
+  int16_t pow2_exp = theor_exp - real_exp;
+  uint16_t pow5_exp = uint16_t(-real_exp);
   if (pow5_exp != 0) {
     FASTFLOAT_ASSERT(theor_digits.pow5(pow5_exp));
   }
   if (pow2_exp > 0) {
-    FASTFLOAT_ASSERT(theor_digits.pow2(uint32_t(pow2_exp)));
+    FASTFLOAT_ASSERT(theor_digits.pow2(pow2_exp));
   } else if (pow2_exp < 0) {
-    FASTFLOAT_ASSERT(real_digits.pow2(uint32_t(-pow2_exp)));
+    FASTFLOAT_ASSERT(real_digits.pow2(-pow2_exp));
   }
 
   // compare digits, and use it to director rounding
   int ord = real_digits.compare(theor_digits);
-  round<T>(am, [ord](adjusted_mantissa &a, int32_t shift) {
+  round<T>(am, [ord](adjusted_mantissa &a, int16_t shift) {
     round_nearest_tie_even(
         a, shift, [ord](bool is_odd, bool _, bool __) -> bool {
           (void)_;  // not needed, since we've done our comparison
@@ -420,6 +422,8 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
           }
         });
   });
+
+  return am;
 }
 
 // parse the significant digits as a big integer to unambiguously round the
@@ -436,21 +440,21 @@ negative_digit_comp(bigint &bigmant, adjusted_mantissa &am,
 // the actual digits. we then compare the big integer representations
 // of both, and use that to direct rounding.
 template <typename T, typename UC>
-inline FASTFLOAT_CONSTEXPR20 void digit_comp(
-    parsed_number_string_t<UC> const &num, adjusted_mantissa &am) noexcept {
+inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa digit_comp(
+    parsed_number_string_t<UC> const &num, adjusted_mantissa am) noexcept {
   // remove the invalid exponent bias
   am.power2 -= invalid_am_bias;
 
   bigint bigmant;
-  int32_t const sci_exp = scientific_exponent(num);
+  int16_t const sci_exp = scientific_exponent(num);
   
   uint16_t const digits = parse_mantissa<T, UC>(bigmant, num);
   // can't underflow, since digits is at most max_digits.
-  int32_t const exponent = sci_exp + 1 - digits;
+  int16_t const exponent = sci_exp + 1 - digits;
   if (exponent >= 0) {
-    positive_digit_comp<T>(bigmant, am, exponent);
+    return positive_digit_comp<T>(bigmant, am, exponent);
   } else {
-    negative_digit_comp<T>(bigmant, am, exponent);
+    return negative_digit_comp<T>(bigmant, am, exponent);
   }
 }
 
diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h
index 53fc4b4..7de83f5 100644
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -293,15 +293,15 @@ fastfloat_strncasecmp(UC const *actual_mixedcase, UC const *expected_lowercase,
 // a pointer and a length to a contiguous block of memory
 template <typename T> struct span {
   T const *ptr;
-  uint32_t length;
+  uint16_t length;
 
-  constexpr span(T const *_ptr, uint32_t _length) : ptr(_ptr), length(_length) {}
+  constexpr span(T const *_ptr, uint16_t _length) : ptr(_ptr), length(_length) {}
 
   constexpr span() : ptr(nullptr), length(0) {}
 
-  constexpr uint32_t len() const noexcept { return length; }
+  constexpr uint16_t len() const noexcept { return length; }
 
-  FASTFLOAT_CONSTEXPR14 const T &operator[](uint32_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const T &operator[](uint16_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return ptr[index];
   }
@@ -318,8 +318,8 @@ struct value128 {
 };
 
 /* Helper C++14 constexpr generic implementation of leading_zeroes */
-fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int
-leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint8_t
+leading_zeroes_generic(uint64_t input_num, uint64_t last_bit = 0) {
   if (input_num & uint64_t(0xffffffff00000000)) {
     input_num >>= 32;
     last_bit |= 32;
@@ -343,11 +343,11 @@ leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
   if (input_num & uint64_t(0x2)) { /* input_num >>=  1; */
     last_bit |= 1;
   }
-  return 63 - last_bit;
+  return 63 - (uint8_t)last_bit;
 }
 
 /* result might be undefined when input_num is zero */
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint8_t
 leading_zeroes(uint64_t input_num) noexcept {
   assert(input_num > 0);
   FASTFLOAT_ASSUME(input_num > 0);
@@ -360,12 +360,12 @@ leading_zeroes(uint64_t input_num) noexcept {
   // Search the mask data from most significant bit (MSB)
   // to least significant bit (LSB) for a set bit (1).
   _BitScanReverse64(&leading_zero, input_num);
-  return (int)(63 - leading_zero);
+  return (uint8_t)(63 - leading_zero);
 #else
-  return leading_zeroes_generic(input_num);
+  return (uint8_t)leading_zeroes_generic(input_num);
 #endif
 #else
-  return __builtin_clzll(input_num);
+  return (uint8_t)__builtin_clzll(input_num);
 #endif
 }
 
@@ -429,7 +429,7 @@ full_multiplication(uint64_t a, uint64_t b) noexcept {
 
 struct adjusted_mantissa {
   uint64_t mantissa;
-  int32_t power2; // a negative value indicates an invalid result
+  int16_t power2; // a negative value indicates an invalid result
   adjusted_mantissa() noexcept = default;
 
   constexpr bool operator==(adjusted_mantissa const &o) const noexcept {
diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h
index 93c28a1..ca94b05 100644
--- a/include/fast_float/parse_number.h
+++ b/include/fast_float/parse_number.h
@@ -283,7 +283,7 @@ from_chars_advanced(parsed_number_string_t<UC> const &pns, T &value) noexcept {
   // and we have an invalid power (am.power2 < 0), then we need to go the long
   // way around again. This is very uncommon.
   if (am.power2 < 0) {
-    digit_comp<T>(pns, am);
+    am = digit_comp<T>(pns, am);
   }
   to_float(
 #ifndef FASTFLOAT_ONLY_POSITIVE_C_NUMBER_WO_INF_NAN