From 52618851fd7d03ba7a81cf8e72c0fae4eb477615 Mon Sep 17 00:00:00 2001
From: Lenard Szolnoki
Date: Fri, 3 Mar 2023 22:43:52 +0000
Subject: [PATCH 1/2] Make all float_common.h functions constexpr in C++20

---
 include/fast_float/float_common.h | 100 ++++++++++++++++++++++++------
 1 file changed, 81 insertions(+), 19 deletions(-)

diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h
index 1e927dc..77e3e59 100644
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -7,6 +7,25 @@
 #include <cstring>
 #include <type_traits>
 
+#ifdef __has_include
+#if __has_include(<version>)
+#include <version>
+#endif
+#endif
+
+#if __cpp_lib_bit_cast >= 201806L
+#include <bit>
+#define FASTFLOAT_HAS_BIT_CAST 1
+#else
+#define FASTFLOAT_HAS_BIT_CAST 0
+#endif
+
+#if __cpp_lib_is_constant_evaluated >= 201811L
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1
+#else
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0
+#endif
+
 #if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64)     \
     || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \
     || defined(__MINGW64__)                                          \
@@ -98,8 +117,25 @@
 #define FASTFLOAT_CONSTEXPR14
 #endif
 
+// Testing for relevant C++20 constexpr library features
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \
+    && FASTFLOAT_HAS_BIT_CAST \
+    && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
+#define FASTFLOAT_CONSTEXPR20 constexpr
+#else
+#define FASTFLOAT_CONSTEXPR20
+#endif
+
 namespace fast_float {
 
+fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() {
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED
+  return std::is_constant_evaluated();
+#else
+  return false;
+#endif
+}
+
 // Compares two ASCII strings in a case insensitive manner.
 inline FASTFLOAT_CONSTEXPR14 bool
 fastfloat_strncasecmp(const char *input1, const char *input2, size_t length) {
@@ -139,9 +175,27 @@ struct value128 {
   constexpr value128() : low(0), high(0) {}
 };
 
+/* Helper C++11 constexpr generic implementation of leading_zeroes */
+fastfloat_really_inline constexpr
+int leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
+  return (
+    ((input_num & uint64_t(0xffffffff00000000)) && (input_num >>= 32, last_bit |= 32)),
+    ((input_num & uint64_t(        0xffff0000)) && (input_num >>= 16, last_bit |= 16)),
+    ((input_num & uint64_t(            0xff00)) && (input_num >>= 8,  last_bit |= 8)),
+    ((input_num & uint64_t(              0xf0)) && (input_num >>= 4,  last_bit |= 4)),
+    ((input_num & uint64_t(               0xc)) && (input_num >>= 2,  last_bit |= 2)),
+    ((input_num & uint64_t(               0x2)) && (input_num >>= 1,  last_bit |= 1)),
+    63 - last_bit
+  );
+}
+
 /* result might be undefined when input_num is zero */
-fastfloat_really_inline int leading_zeroes(uint64_t input_num) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+int leading_zeroes(uint64_t input_num) {
   assert(input_num > 0);
+  if (cpp20_and_in_constexpr()) {
+    return leading_zeroes_generic(input_num);
+  }
 #ifdef FASTFLOAT_VISUAL_STUDIO
 #if defined(_M_X64) || defined(_M_ARM64)
   unsigned long leading_zero = 0;
@@ -150,31 +204,20 @@ fastfloat_really_inline int leading_zeroes(uint64_t input_num) {
   _BitScanReverse64(&leading_zero, input_num);
   return (int)(63 - leading_zero);
 #else
-  int last_bit = 0;
-  if(input_num & uint64_t(0xffffffff00000000)) input_num >>= 32, last_bit |= 32;
-  if(input_num & uint64_t(        0xffff0000)) input_num >>= 16, last_bit |= 16;
-  if(input_num & uint64_t(            0xff00)) input_num >>= 8,  last_bit |= 8;
-  if(input_num & uint64_t(              0xf0)) input_num >>= 4,  last_bit |= 4;
-  if(input_num & uint64_t(               0xc)) input_num >>= 2,  last_bit |= 2;
-  if(input_num & uint64_t(               0x2)) input_num >>= 1,  last_bit |= 1;
-  return 63 - last_bit;
+  return leading_zeroes_generic(input_num);
 #endif
 #else
   return __builtin_clzll(input_num);
 #endif
 }
 
-#ifdef FASTFLOAT_32BIT
-
 // slow emulation routine for 32-bit
 fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
 
-// slow emulation routine for 32-bit
-#if !defined(__MINGW64__)
-fastfloat_really_inline constexpr uint64_t _umul128(uint64_t ab, uint64_t cd,
-                                                    uint64_t *hi) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) {
  uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd);
  uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd);
  uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32));
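
The hunks above are the core of this patch: FASTFLOAT_CONSTEXPR20 expands to constexpr only when the required C++20 library features are detected, and cpp20_and_in_constexpr() steers constant evaluation onto a portable code path while runtime calls keep the fast intrinsic. A minimal standalone sketch of the same idiom (assuming C++20 and a GCC/Clang-style __builtin_clzll; the names are illustrative, not part of the patch):

    #include <cstdint>
    #include <type_traits>

    // Portable bit-scan fallback, legal in constant expressions.
    constexpr int clz64_generic(uint64_t x, int last_bit = 0) {
      return (
        ((x & 0xffffffff00000000) && (x >>= 32, last_bit |= 32)),
        ((x & 0x00000000ffff0000) && (x >>= 16, last_bit |= 16)),
        ((x & 0x000000000000ff00) && (x >>= 8,  last_bit |= 8)),
        ((x & 0x00000000000000f0) && (x >>= 4,  last_bit |= 4)),
        ((x & 0x000000000000000c) && (x >>= 2,  last_bit |= 2)),
        ((x & 0x0000000000000002) && (x >>= 1,  last_bit |= 1)),
        63 - last_bit);
    }

    constexpr int clz64(uint64_t x) {
      if (std::is_constant_evaluated()) {
        return clz64_generic(x);  // compile-time path, fully portable
      }
      return __builtin_clzll(x);  // runtime path (GCC/Clang intrinsic)
    }

    static_assert(clz64(1) == 63);
    static_assert(clz64(0x8000000000000000) == 0);

The same function body thus serves both worlds, and in optimized builds the is_constant_evaluated() test folds away entirely.
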
@@ -184,14 +227,28 @@ fastfloat_really_inline constexpr uint64_t _umul128(uint64_t ab, uint64_t cd,
          (adbc_carry << 32) + !!(lo < bd);
   return lo;
 }
+
+#ifdef FASTFLOAT_32BIT
+
+// slow emulation routine for 32-bit
+#if !defined(__MINGW64__)
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
+  return umul128_generic(ab, cd, hi);
+}
 #endif // !__MINGW64__
 #endif // FASTFLOAT_32BIT
 
 // compute 64-bit a*b
-fastfloat_really_inline value128 full_multiplication(uint64_t a,
-                                                     uint64_t b) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+value128 full_multiplication(uint64_t a, uint64_t b) {
+  if (cpp20_and_in_constexpr()) {
+    value128 answer;
+    answer.low = umul128_generic(a, b, &answer.high);
+    return answer;
+  }
   value128 answer;
 #if defined(_M_ARM64) && !defined(__MINGW32__)
   // ARM64 has native support for 64-bit multiplications, no need to emulate
@@ -205,7 +262,7 @@ fastfloat_really_inline value128 full_multiplication(uint64_t a,
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
 #else
-  #error Not implemented
+  answer.low = umul128_generic(a, b, &answer.high);
 #endif
   return answer;
 }
@@ -442,12 +499,17 @@ template <> inline constexpr binary_format<double>::equiv_uint
 }
 
 template <typename T>
-fastfloat_really_inline void to_float(bool negative, adjusted_mantissa am, T &value) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+void to_float(bool negative, adjusted_mantissa am, T &value) {
   using uint = typename binary_format<T>::equiv_uint;
   uint word = (uint)am.mantissa;
   word |= uint(am.power2) << binary_format<T>::mantissa_explicit_bits();
   word |= uint(negative) << binary_format<T>::sign_index();
+#if FASTFLOAT_HAS_BIT_CAST
+  value = std::bit_cast<T>(word);
+#else
   ::memcpy(&value, &word, sizeof(T));
+#endif
 }
 
 #if FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
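
The to_float change is where FASTFLOAT_HAS_BIT_CAST pays off: std::bit_cast is valid in constant expressions, while memcpy-based type punning is not, so the final IEEE-754 word can be assembled at compile time. A reduced sketch of the idea (C++20; make_float is an illustrative name, not a fast_float API):

    #include <bit>
    #include <cstdint>

    constexpr float make_float(bool negative, uint32_t mantissa, uint32_t biased_exp) {
      uint32_t word = mantissa;           // 23 explicit mantissa bits
      word |= biased_exp << 23;           // 8 exponent bits
      word |= uint32_t(negative) << 31;   // sign bit
      return std::bit_cast<float>(word);  // constexpr-friendly, unlike memcpy
    }

    // 1.5f: sign 0, biased exponent 127, mantissa 0x400000 (the .5 part).
    static_assert(make_float(false, 0x400000, 127) == 1.5f);
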
From 5243dd97fe50d927f52b86c9388c8a3fde3b6230 Mon Sep 17 00:00:00 2001
From: Lenard Szolnoki
Date: Fri, 3 Mar 2023 23:13:52 +0000
Subject: [PATCH 2/2] Constexpr bigint

---
 include/fast_float/bigint.h | 126 +++++++++++++++++++++---------------
 1 file changed, 74 insertions(+), 52 deletions(-)

diff --git a/include/fast_float/bigint.h b/include/fast_float/bigint.h
index 073ddf8..5a72caa 100644
--- a/include/fast_float/bigint.h
+++ b/include/fast_float/bigint.h
@@ -50,7 +50,7 @@ struct stackvec {
   stackvec &operator=(stackvec &&other) = delete;
 
   // create stack vector from existing limb span.
-  stackvec(limb_span s) {
+  FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) {
     FASTFLOAT_ASSERT(try_extend(s));
   }
 
@@ -97,13 +97,13 @@ struct stackvec {
     }
   }
   // add items to the vector, from a span, without bounds checking
-  void extend_unchecked(limb_span s) noexcept {
+  FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept {
     limb* ptr = data + length;
-    ::memcpy((void*)ptr, (const void*)s.ptr, sizeof(limb) * s.len());
+    std::copy_n(s.ptr, s.len(), ptr);
     set_len(len() + s.len());
   }
   // try to add items to the vector, returning if items were added
-  bool try_extend(limb_span s) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool try_extend(limb_span s) noexcept {
     if (len() + s.len() <= capacity()) {
       extend_unchecked(s);
       return true;
@@ -114,6 +114,7 @@ struct stackvec {
   // resize the vector, without bounds checking
   // if the new size is longer than the vector, assign value to each
   // appended item.
+  FASTFLOAT_CONSTEXPR20
   void resize_unchecked(size_t new_len, limb value) noexcept {
     if (new_len > len()) {
       size_t count = new_len - len();
@@ -126,7 +127,7 @@ struct stackvec {
     }
   }
   // try to resize the vector, returning if the vector was resized.
-  bool try_resize(size_t new_len, limb value) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept {
    if (new_len > capacity()) {
      return false;
    } else {
@@ -160,14 +161,14 @@ uint64_t empty_hi64(bool& truncated) noexcept {
   return 0;
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept {
   truncated = false;
   int shl = leading_zeroes(r0);
   return r0 << shl;
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept {
   int shl = leading_zeroes(r0);
   if (shl == 0) {
@@ -180,19 +181,19 @@ uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept {
   }
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept {
   return uint64_hi64(r0, truncated);
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
   return uint64_hi64((x0 << 32) | x1, truncated);
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
@@ -204,15 +205,16 @@ uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept {
 // we want an efficient operation. for msvc, where
 // we don't have built-in intrinsics, this is still
 // pretty fast.
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 limb scalar_add(limb x, limb y, bool& overflow) noexcept {
   limb z;
-
   // gcc and clang
 #if defined(__has_builtin)
 #if __has_builtin(__builtin_add_overflow)
-  overflow = __builtin_add_overflow(x, y, &z);
-  return z;
+  if (!cpp20_and_in_constexpr()) {
+    overflow = __builtin_add_overflow(x, y, &z);
+    return z;
+  }
 #endif
 #endif
 
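
The branch added to scalar_add only bypasses __builtin_add_overflow during constant evaluation; at runtime the intrinsic still runs. The portable fallback that scalar_add keeps below this hunk detects overflow through unsigned wraparound: the wrapped sum is smaller than an operand exactly when the true sum does not fit. A small sketch of that test (illustrative names, not part of the patch):

    #include <cstdint>

    constexpr uint64_t checked_add(uint64_t x, uint64_t y, bool& overflow) {
      uint64_t z = x + y;  // unsigned addition wraps modulo 2^64
      overflow = z < x;    // true exactly when x + y >= 2^64
      return z;
    }

    constexpr bool overflows(uint64_t x, uint64_t y) {
      bool o = false;
      checked_add(x, y, o);
      return o;
    }

    static_assert(!overflows(1, 2));
    static_assert(overflows(UINT64_MAX, 1));
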
@@ -223,7 +225,7 @@ limb scalar_add(limb x, limb y, bool& overflow) noexcept {
 }
 
 // multiply two small integers, getting both the high and low bits.
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 limb scalar_mul(limb x, limb y, limb& carry) noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
 #if defined(__SIZEOF_INT128__)
@@ -251,7 +253,8 @@ limb scalar_mul(limb x, limb y, limb& carry) noexcept {
 // add scalar value to bigint starting from offset.
 // used in grade school multiplication
 template <uint16_t size>
-inline bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
+inline FASTFLOAT_CONSTEXPR20
+bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
   size_t index = start;
   limb carry = y;
   bool overflow;
@@ -268,13 +271,15 @@ inline bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
 
 // add scalar value to bigint.
 template <uint16_t size>
-fastfloat_really_inline bool small_add(stackvec<size>& vec, limb y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool small_add(stackvec<size>& vec, limb y) noexcept {
   return small_add_from(vec, y, 0);
 }
 
 // multiply bigint by scalar value.
 template <uint16_t size>
-inline bool small_mul(stackvec<size>& vec, limb y) noexcept {
+inline FASTFLOAT_CONSTEXPR20
+bool small_mul(stackvec<size>& vec, limb y) noexcept {
   limb carry = 0;
   for (size_t index = 0; index < vec.len(); index++) {
     vec[index] = scalar_mul(vec[index], y, carry);
@@ -288,6 +293,7 @@ inline bool small_mul(stackvec<size>& vec, limb y) noexcept {
 // add bigint to bigint starting from index.
 // used in grade school multiplication
 template <uint16_t size>
+FASTFLOAT_CONSTEXPR20
 bool large_add_from(stackvec<size>& x, limb_span y, size_t start) noexcept {
   // the effective x buffer is from `xstart..x.len()`, so exit early
   // if we can't get that current range.
@@ -318,12 +324,14 @@ bool large_add_from(stackvec<size>& x, limb_span y, size_t start) noexcept {
 
 // add bigint to bigint.
 template <uint16_t size>
-fastfloat_really_inline bool large_add_from(stackvec<size>& x, limb_span y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool large_add_from(stackvec<size>& x, limb_span y) noexcept {
   return large_add_from(x, y, 0);
 }
 
 // grade-school multiplication algorithm
 template <uint16_t size>
+FASTFLOAT_CONSTEXPR20
 bool long_mul(stackvec<size>& x, limb_span y) noexcept {
   limb_span xs = limb_span(x.data, x.len());
   stackvec<size> z(xs);
@@ -352,6 +360,7 @@ bool long_mul(stackvec<size>& x, limb_span y) noexcept {
 
 // grade-school multiplication algorithm
 template <uint16_t size>
+FASTFLOAT_CONSTEXPR20
 bool large_mul(stackvec<size>& x, limb_span y) noexcept {
   if (y.len() == 1) {
     FASTFLOAT_TRY(small_mul(x, y[0]));
@@ -361,21 +370,52 @@ bool large_mul(stackvec<size>& x, limb_span y) noexcept {
   return true;
 }
 
+template <typename = void>
+struct pow5_tables {
+  static constexpr uint32_t large_step = 135;
+  static constexpr uint64_t small_power_of_5[] = {
+    1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL,
+    1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL,
+    6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL,
+    3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL,
+    2384185791015625UL, 11920928955078125UL, 59604644775390625UL,
+    298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL,
+  };
+#ifdef FASTFLOAT_64BIT_LIMB
+  constexpr static limb large_power_of_5[] = {
+    1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
+    10482974169319127550UL, 198276706040285095UL};
+#else
+  constexpr static limb large_power_of_5[] = {
+    4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U,
+    1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U};
+#endif
+};
+
+template <typename T>
+constexpr uint32_t pow5_tables<T>::large_step;
+
+template <typename T>
+constexpr uint64_t pow5_tables<T>::small_power_of_5[];
+
+template <typename T>
+constexpr limb pow5_tables<T>::large_power_of_5[];
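
pow5_tables relies on a header-only idiom: because the holder is a class template (the defaulted parameter exists only so that pow5_tables<> names a concrete type), the out-of-line definitions of its static data members may legally appear in every translation unit that includes the header. That is what moving the tables out of bigint::pow5 needs without a separate .cpp file or C++17 inline variables. A reduced sketch (illustrative names, not part of the patch):

    #include <cstdint>

    template <typename = void>
    struct tables {
      static constexpr uint32_t values[] = {1, 5, 25, 125};
    };

    // Definition in a header: fine for a template's static member,
    // since templated entities are exempt from the one-definition rule.
    template <typename T>
    constexpr uint32_t tables<T>::values[];

    static_assert(tables<>::values[3] == 125);
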
+
 // big integer type. implements a small subset of big integer
 // arithmetic, using simple algorithms since asymptotically
 // faster algorithms are slower for a small number of limbs.
 // all operations assume the big-integer is normalized.
-struct bigint {
+struct bigint : pow5_tables<> {
   // storage of the limbs, in little-endian order.
   stackvec<bigint_limbs> vec;
 
-  bigint(): vec() {}
+  FASTFLOAT_CONSTEXPR20 bigint(): vec() {}
   bigint(const bigint &) = delete;
   bigint &operator=(const bigint &) = delete;
   bigint(bigint &&) = delete;
   bigint &operator=(bigint &&other) = delete;
 
-  bigint(uint64_t value): vec() {
+  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value): vec() {
 #ifdef FASTFLOAT_64BIT_LIMB
     vec.push_unchecked(value);
 #else
@@ -387,7 +427,7 @@ struct bigint {
 
   // get the high 64 bits from the vector, and if bits were truncated.
   // this is to get the significant digits for the float.
-  uint64_t hi64(bool& truncated) const noexcept {
+  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool& truncated) const noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
     if (vec.len() == 0) {
       return empty_hi64(truncated);
@@ -419,7 +459,7 @@ struct bigint {
   // positive, this is larger, otherwise they are equal.
   // the limbs are stored in little-endian order, so we
   // must compare the limbs in reverse order.
-  int compare(const bigint& other) const noexcept {
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint& other) const noexcept {
     if (vec.len() > other.vec.len()) {
       return 1;
     } else if (vec.len() < other.vec.len()) {
@@ -440,7 +480,7 @@ struct bigint {
 
   // shift left each limb n bits, carrying over to the new limb
   // returns true if we were able to shift all the digits.
-  bool shl_bits(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept {
     // Internally, for each item, we shift left by n, and add the previous
     // right shifted limb-bits.
     // For example, we transform (for u8) shifted left 2, to:
@@ -466,7 +506,7 @@ struct bigint {
   }
 
   // move the limbs left by `n` limbs.
-  bool shl_limbs(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept {
     FASTFLOAT_DEBUG_ASSERT(n != 0);
     if (n + vec.len() > vec.capacity()) {
       return false;
@@ -487,7 +527,7 @@ struct bigint {
   }
 
   // move the limbs left by `n` bits.
-  bool shl(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept {
     size_t rem = n % limb_bits;
     size_t div = n / limb_bits;
     if (rem != 0) {
@@ -500,7 +540,7 @@ struct bigint {
   }
 
   // get the number of leading zeros in the bigint.
-  int ctlz() const noexcept {
+  FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept {
     if (vec.is_empty()) {
       return 0;
     } else {
@@ -515,45 +555,27 @@ struct bigint {
     }
   }
 
   // get the number of bits in the bigint.
-  int bit_length() const noexcept {
+  FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept {
     int lz = ctlz();
     return int(limb_bits * vec.len()) - lz;
   }
 
-  bool mul(limb y) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept {
     return small_mul(vec, y);
   }
 
-  bool add(limb y) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept {
     return small_add(vec, y);
   }
 
   // multiply as if by 2 raised to a power.
-  bool pow2(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept {
     return shl(exp);
   }
 
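
pow2 above reduces to shl, which in turn leans on shl_bits: every limb is shifted left by n while receiving the bits that spilled out of the previous, less significant limb. A reduced two-limb sketch with 8-bit limbs (illustrative names, not part of the patch; assumes 0 < n < 8):

    #include <cstdint>

    constexpr void shl2limbs(uint8_t (&limbs)[2], int n) {
      uint8_t carry = uint8_t(limbs[0] >> (8 - n)); // bits spilling out of limb 0
      limbs[0] = uint8_t(limbs[0] << n);
      limbs[1] = uint8_t((limbs[1] << n) | carry);  // limb 1 is more significant
    }

    constexpr bool shl2limbs_check() {
      uint8_t limbs[2] = {0x5d, 0x35};  // little-endian limbs of 0x355d
      shl2limbs(limbs, 2);
      return limbs[0] == 0x74 && limbs[1] == 0xd5;  // 0xd574 == 0x355d << 2
    }
    static_assert(shl2limbs_check());
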
   // multiply as if by 5 raised to a power.
-  bool pow5(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
     // multiply by a power of 5
-    static constexpr uint32_t large_step = 135;
-    static constexpr uint64_t small_power_of_5[] = {
-      1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL,
-      1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL,
-      6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL,
-      3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL,
-      2384185791015625UL, 11920928955078125UL, 59604644775390625UL,
-      298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL,
-    };
-#ifdef FASTFLOAT_64BIT_LIMB
-    constexpr static limb large_power_of_5[] = {
-      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
-      10482974169319127550UL, 198276706040285095UL};
-#else
-    constexpr static limb large_power_of_5[] = {
-      4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U,
-      1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U};
-#endif
     size_t large_length = sizeof(large_power_of_5) / sizeof(limb);
     limb_span large = limb_span(large_power_of_5, large_length);
     while (exp >= large_step) {
@@ -579,7 +601,7 @@ struct bigint {
   }
 
   // multiply as if by 10 raised to a power.
-  bool pow10(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept {
     FASTFLOAT_TRY(pow5(exp));
     return pow2(exp);
   }
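
These two patches make the support headers (float_common.h and bigint.h) constexpr-ready; once the parser entry points are likewise marked with FASTFLOAT_CONSTEXPR20, as in released fast_float 4.0, from_chars itself becomes usable in constant expressions. A minimal usage sketch under that assumption, on a C++20 compiler that satisfies the feature-test macros checked in patch 1:

    #include "fast_float/fast_float.h"
    #include <string_view>

    constexpr double parse(std::string_view s) {
      double d = 0;
      auto answer = fast_float::from_chars(s.data(), s.data() + s.size(), d);
      if (answer.ec != std::errc()) { return 0; /* handle errors in real code */ }
      return d;
    }

    // Both sides are correctly rounded, so they compare equal.
    static_assert(parse("3.1416") == 3.1416);
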