From 52618851fd7d03ba7a81cf8e72c0fae4eb477615 Mon Sep 17 00:00:00 2001
From: Lenard Szolnoki
Date: Fri, 3 Mar 2023 22:43:52 +0000
Subject: [PATCH 1/2] Make all float_common.h functions constexpr in C++20

---
 include/fast_float/float_common.h | 100 ++++++++++++++++++++++++------
 1 file changed, 81 insertions(+), 19 deletions(-)

diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h
index 1e927dc..77e3e59 100644
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -7,6 +7,25 @@
 #include <cstring>
 #include <type_traits>
 
+#ifdef __has_include
+#if __has_include(<version>)
+#include <version>
+#endif
+#endif
+
+#if __cpp_lib_bit_cast >= 201806L
+#include <bit>
+#define FASTFLOAT_HAS_BIT_CAST 1
+#else
+#define FASTFLOAT_HAS_BIT_CAST 0
+#endif
+
+#if __cpp_lib_is_constant_evaluated >= 201811L
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1
+#else
+#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0
+#endif
+
 #if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64)     \
     || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \
     || defined(__MINGW64__)                                          \
@@ -98,8 +117,25 @@
 #define FASTFLOAT_CONSTEXPR14
 #endif
 
+// Testing for relevant C++20 constexpr library features
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \
+    && FASTFLOAT_HAS_BIT_CAST \
+    && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
+#define FASTFLOAT_CONSTEXPR20 constexpr
+#else
+#define FASTFLOAT_CONSTEXPR20
+#endif
+
 namespace fast_float {
 
+fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() {
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED
+  return std::is_constant_evaluated();
+#else
+  return false;
+#endif
+}
+
 // Compares two ASCII strings in a case insensitive manner.
 inline FASTFLOAT_CONSTEXPR14 bool
 fastfloat_strncasecmp(const char *input1, const char *input2, size_t length) {
@@ -139,9 +175,27 @@ struct value128 {
   constexpr value128() : low(0), high(0) {}
 };
 
+/* Helper C++11 constexpr generic implementation of leading_zeroes */
+fastfloat_really_inline constexpr
+int leading_zeroes_generic(uint64_t input_num, int last_bit = 0) {
+  return (
+    ((input_num & uint64_t(0xffffffff00000000)) && (input_num >>= 32, last_bit |= 32)),
+    ((input_num & uint64_t(        0xffff0000)) && (input_num >>= 16, last_bit |= 16)),
+    ((input_num & uint64_t(            0xff00)) && (input_num >>= 8,  last_bit |= 8)),
+    ((input_num & uint64_t(              0xf0)) && (input_num >>= 4,  last_bit |= 4)),
+    ((input_num & uint64_t(               0xc)) && (input_num >>= 2,  last_bit |= 2)),
+    ((input_num & uint64_t(               0x2)) && (input_num >>= 1,  last_bit |= 1)),
+    63 - last_bit
+  );
+}
+
 /* result might be undefined when input_num is zero */
-fastfloat_really_inline int leading_zeroes(uint64_t input_num) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+int leading_zeroes(uint64_t input_num) {
   assert(input_num > 0);
+  if (cpp20_and_in_constexpr()) {
+    return leading_zeroes_generic(input_num);
+  }
 #ifdef FASTFLOAT_VISUAL_STUDIO
 #if defined(_M_X64) || defined(_M_ARM64)
   unsigned long leading_zero = 0;
@@ -150,31 +204,20 @@ fastfloat_really_inline int leading_zeroes(uint64_t input_num) {
   _BitScanReverse64(&leading_zero, input_num);
   return (int)(63 - leading_zero);
 #else
-  int last_bit = 0;
-  if(input_num & uint64_t(0xffffffff00000000)) input_num >>= 32, last_bit |= 32;
-  if(input_num & uint64_t(        0xffff0000)) input_num >>= 16, last_bit |= 16;
-  if(input_num & uint64_t(            0xff00)) input_num >>= 8,  last_bit |= 8;
-  if(input_num & uint64_t(              0xf0)) input_num >>= 4,  last_bit |= 4;
-  if(input_num & uint64_t(               0xc)) input_num >>= 2,  last_bit |= 2;
-  if(input_num & uint64_t(               0x2)) input_num >>= 1,  last_bit |= 1;
-  return 63 - last_bit;
+  return leading_zeroes_generic(input_num);
 #endif
 #else
   return __builtin_clzll(input_num);
 #endif
 }
 
-#ifdef FASTFLOAT_32BIT
-
 // slow emulation routine for 32-bit
 fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
 
-// slow emulation routine for 32-bit
-#if !defined(__MINGW64__)
-fastfloat_really_inline constexpr uint64_t _umul128(uint64_t ab, uint64_t cd,
-                                                    uint64_t *hi) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) {
  uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd);
  uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd);
  uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32));
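
The hunks above are the core of this patch: FASTFLOAT_CONSTEXPR20 expands to constexpr only when the required C++20 library features are detected, and cpp20_and_in_constexpr() steers constant evaluation onto a portable code path while runtime calls keep the fast intrinsic. A minimal standalone sketch of the same idiom (assuming C++20 and a GCC/Clang-style __builtin_clzll; the names are illustrative, not part of the patch):

    #include <cstdint>
    #include <type_traits>

    // Portable bit-scan fallback, legal in constant expressions.
    constexpr int clz64_generic(uint64_t x, int last_bit = 0) {
      return (
        ((x & 0xffffffff00000000) && (x >>= 32, last_bit |= 32)),
        ((x & 0x00000000ffff0000) && (x >>= 16, last_bit |= 16)),
        ((x & 0x000000000000ff00) && (x >>= 8,  last_bit |= 8)),
        ((x & 0x00000000000000f0) && (x >>= 4,  last_bit |= 4)),
        ((x & 0x000000000000000c) && (x >>= 2,  last_bit |= 2)),
        ((x & 0x0000000000000002) && (x >>= 1,  last_bit |= 1)),
        63 - last_bit);
    }

    constexpr int clz64(uint64_t x) {
      if (std::is_constant_evaluated()) {
        return clz64_generic(x);  // compile-time path, fully portable
      }
      return __builtin_clzll(x);  // runtime path (GCC/Clang intrinsic)
    }

    static_assert(clz64(1) == 63);
    static_assert(clz64(0x8000000000000000) == 0);

The same function body thus serves both worlds, and in optimized builds the is_constant_evaluated() test folds away entirely.
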
@@ -184,14 +227,28 @@ fastfloat_really_inline constexpr uint64_t _umul128(uint64_t ab, uint64_t cd,
          (adbc_carry << 32) + !!(lo < bd);
   return lo;
 }
+
+#ifdef FASTFLOAT_32BIT
+
+// slow emulation routine for 32-bit
+#if !defined(__MINGW64__)
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14
+uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
+  return umul128_generic(ab, cd, hi);
+}
 #endif // !__MINGW64__
 #endif // FASTFLOAT_32BIT
 
 // compute 64-bit a*b
-fastfloat_really_inline value128 full_multiplication(uint64_t a,
-                                                     uint64_t b) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+value128 full_multiplication(uint64_t a, uint64_t b) {
+  if (cpp20_and_in_constexpr()) {
+    value128 answer;
+    answer.low = umul128_generic(a, b, &answer.high);
+    return answer;
+  }
   value128 answer;
 #if defined(_M_ARM64) && !defined(__MINGW32__)
   // ARM64 has native support for 64-bit multiplications, no need to emulate
@@ -205,7 +262,7 @@ fastfloat_really_inline value128 full_multiplication(uint64_t a,
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
 #else
-  #error Not implemented
+  answer.low = umul128_generic(a, b, &answer.high);
 #endif
   return answer;
 }
@@ -442,12 +499,17 @@ template <> inline constexpr binary_format<double>::equiv_uint
 }
 
 template <typename T>
-fastfloat_really_inline void to_float(bool negative, adjusted_mantissa am, T &value) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+void to_float(bool negative, adjusted_mantissa am, T &value) {
   using uint = typename binary_format<T>::equiv_uint;
   uint word = (uint)am.mantissa;
   word |= uint(am.power2) << binary_format<T>::mantissa_explicit_bits();
   word |= uint(negative) << binary_format<T>::sign_index();
+#if FASTFLOAT_HAS_BIT_CAST
+  value = std::bit_cast<T>(word);
+#else
   ::memcpy(&value, &word, sizeof(T));
+#endif
 }
 
 #if FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
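
The to_float change is where FASTFLOAT_HAS_BIT_CAST pays off: std::bit_cast is valid in constant expressions, while memcpy-based type punning is not, so the final IEEE-754 word can be assembled at compile time. A reduced sketch of the idea (C++20; make_float is an illustrative name, not a fast_float API):

    #include <bit>
    #include <cstdint>

    constexpr float make_float(bool negative, uint32_t mantissa, uint32_t biased_exp) {
      uint32_t word = mantissa;           // 23 explicit mantissa bits
      word |= biased_exp << 23;           // 8 exponent bits
      word |= uint32_t(negative) << 31;   // sign bit
      return std::bit_cast<float>(word);  // constexpr-friendly, unlike memcpy
    }

    // 1.5f: sign 0, biased exponent 127, mantissa 0x400000 (the .5 part).
    static_assert(make_float(false, 0x400000, 127) == 1.5f);
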
From 5243dd97fe50d927f52b86c9388c8a3fde3b6230 Mon Sep 17 00:00:00 2001
From: Lenard Szolnoki
Date: Fri, 3 Mar 2023 23:13:52 +0000
Subject: [PATCH 2/2] Constexpr bigint

---
 include/fast_float/bigint.h | 126 +++++++++++++++++++++---------------
 1 file changed, 74 insertions(+), 52 deletions(-)

diff --git a/include/fast_float/bigint.h b/include/fast_float/bigint.h
index 073ddf8..5a72caa 100644
--- a/include/fast_float/bigint.h
+++ b/include/fast_float/bigint.h
@@ -50,7 +50,7 @@ struct stackvec {
   stackvec &operator=(stackvec &&other) = delete;
 
   // create stack vector from existing limb span.
-  stackvec(limb_span s) {
+  FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) {
     FASTFLOAT_ASSERT(try_extend(s));
   }
 
@@ -97,13 +97,13 @@ struct stackvec {
     }
   }
   // add items to the vector, from a span, without bounds checking
-  void extend_unchecked(limb_span s) noexcept {
+  FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept {
     limb* ptr = data + length;
-    ::memcpy((void*)ptr, (const void*)s.ptr, sizeof(limb) * s.len());
+    std::copy_n(s.ptr, s.len(), ptr);
     set_len(len() + s.len());
   }
   // try to add items to the vector, returning if items were added
-  bool try_extend(limb_span s) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool try_extend(limb_span s) noexcept {
     if (len() + s.len() <= capacity()) {
       extend_unchecked(s);
       return true;
@@ -114,6 +114,7 @@ struct stackvec {
   // resize the vector, without bounds checking
   // if the new size is longer than the vector, assign value to each
   // appended item.
+  FASTFLOAT_CONSTEXPR20
   void resize_unchecked(size_t new_len, limb value) noexcept {
     if (new_len > len()) {
       size_t count = new_len - len();
@@ -126,7 +127,7 @@ struct stackvec {
     }
   }
   // try to resize the vector, returning if the vector was resized.
-  bool try_resize(size_t new_len, limb value) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept {
    if (new_len > capacity()) {
      return false;
    } else {
@@ -160,14 +161,14 @@ uint64_t empty_hi64(bool& truncated) noexcept {
   return 0;
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept {
   truncated = false;
   int shl = leading_zeroes(r0);
   return r0 << shl;
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept {
   int shl = leading_zeroes(r0);
   if (shl == 0) {
@@ -180,19 +181,19 @@ uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept {
   }
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept {
   return uint64_hi64(r0, truncated);
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
   return uint64_hi64((x0 << 32) | x1, truncated);
 }
 
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
@@ -204,15 +205,16 @@ uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept {
 // we want an efficient operation. for msvc, where
 // we don't have built-in intrinsics, this is still
 // pretty fast.
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 limb scalar_add(limb x, limb y, bool& overflow) noexcept {
   limb z;
-
   // gcc and clang
 #if defined(__has_builtin)
 #if __has_builtin(__builtin_add_overflow)
-  overflow = __builtin_add_overflow(x, y, &z);
-  return z;
+  if (!cpp20_and_in_constexpr()) {
+    overflow = __builtin_add_overflow(x, y, &z);
+    return z;
+  }
 #endif
 #endif
 
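
The branch added to scalar_add only bypasses __builtin_add_overflow during constant evaluation; at runtime the intrinsic still runs. The portable fallback that scalar_add keeps below this hunk detects overflow through unsigned wraparound: the wrapped sum is smaller than an operand exactly when the true sum does not fit. A small sketch of that test (illustrative names, not part of the patch):

    #include <cstdint>

    constexpr uint64_t checked_add(uint64_t x, uint64_t y, bool& overflow) {
      uint64_t z = x + y;  // unsigned addition wraps modulo 2^64
      overflow = z < x;    // true exactly when x + y >= 2^64
      return z;
    }

    constexpr bool overflows(uint64_t x, uint64_t y) {
      bool o = false;
      checked_add(x, y, o);
      return o;
    }

    static_assert(!overflows(1, 2));
    static_assert(overflows(UINT64_MAX, 1));
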
@@ -223,7 +225,7 @@ limb scalar_add(limb x, limb y, bool& overflow) noexcept {
 }
 
 // multiply two small integers, getting both the high and low bits.
-fastfloat_really_inline
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 limb scalar_mul(limb x, limb y, limb& carry) noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
 #if defined(__SIZEOF_INT128__)
@@ -251,7 +253,8 @@ limb scalar_mul(limb x, limb y, limb& carry) noexcept {
 // add scalar value to bigint starting from offset.
 // used in grade school multiplication
 template <uint16_t size>
-inline bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
+inline FASTFLOAT_CONSTEXPR20
+bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
   size_t index = start;
   limb carry = y;
   bool overflow;
@@ -268,13 +271,15 @@ inline bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
 
 // add scalar value to bigint.
 template <uint16_t size>
-fastfloat_really_inline bool small_add(stackvec<size>& vec, limb y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool small_add(stackvec<size>& vec, limb y) noexcept {
   return small_add_from(vec, y, 0);
 }
 
 // multiply bigint by scalar value.
 template <uint16_t size>
-inline bool small_mul(stackvec<size>& vec, limb y) noexcept {
+inline FASTFLOAT_CONSTEXPR20
+bool small_mul(stackvec<size>& vec, limb y) noexcept {
   limb carry = 0;
   for (size_t index = 0; index < vec.len(); index++) {
     vec[index] = scalar_mul(vec[index], y, carry);
@@ -288,6 +293,7 @@ inline bool small_mul(stackvec<size>& vec, limb y) noexcept {
 // add bigint to bigint starting from index.
 // used in grade school multiplication
 template <uint16_t size>
+FASTFLOAT_CONSTEXPR20
 bool large_add_from(stackvec<size>& x, limb_span y, size_t start) noexcept {
   // the effective x buffer is from `xstart..x.len()`, so exit early
   // if we can't get that current range.
@@ -318,12 +324,14 @@ bool large_add_from(stackvec<size>& x, limb_span y, size_t start) noexcept {
 
 // add bigint to bigint.
 template <uint16_t size>
-fastfloat_really_inline bool large_add_from(stackvec<size>& x, limb_span y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool large_add_from(stackvec<size>& x, limb_span y) noexcept {
   return large_add_from(x, y, 0);
 }
 
 // grade-school multiplication algorithm
 template <uint16_t size>
+FASTFLOAT_CONSTEXPR20
 bool long_mul(stackvec<size>& x, limb_span y) noexcept {
   limb_span xs = limb_span(x.data, x.len());
   stackvec<size> z(xs);
@@ -352,6 +360,7 @@ bool long_mul(stackvec<size>& x, limb_span y) noexcept {
 
 // grade-school multiplication algorithm
 template <uint16_t size>
+FASTFLOAT_CONSTEXPR20
 bool large_mul(stackvec<size>& x, limb_span y) noexcept {
   if (y.len() == 1) {
     FASTFLOAT_TRY(small_mul(x, y[0]));
@@ -361,21 +370,52 @@ bool large_mul(stackvec<size>& x, limb_span y) noexcept {
   return true;
 }
 
+template <typename = void>
+struct pow5_tables {
+  static constexpr uint32_t large_step = 135;
+  static constexpr uint64_t small_power_of_5[] = {
+    1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL,
+    1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL,
+    6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL,
+    3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL,
+    2384185791015625UL, 11920928955078125UL, 59604644775390625UL,
+    298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL,
+  };
+#ifdef FASTFLOAT_64BIT_LIMB
+  constexpr static limb large_power_of_5[] = {
+    1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
+    10482974169319127550UL, 198276706040285095UL};
+#else
+  constexpr static limb large_power_of_5[] = {
+    4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U,
+    1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U};
+#endif
+};
+
+template <typename T>
+constexpr uint32_t pow5_tables<T>::large_step;
+
+template <typename T>
+constexpr uint64_t pow5_tables<T>::small_power_of_5[];
+
+template <typename T>
+constexpr limb pow5_tables<T>::large_power_of_5[];
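
pow5_tables relies on a header-only idiom: because the holder is a class template (the defaulted parameter exists only so that pow5_tables<> names a concrete type), the out-of-line definitions of its static data members may legally appear in every translation unit that includes the header. That is what moving the tables out of bigint::pow5 needs without a separate .cpp file or C++17 inline variables. A reduced sketch (illustrative names, not part of the patch):

    #include <cstdint>

    template <typename = void>
    struct tables {
      static constexpr uint32_t values[] = {1, 5, 25, 125};
    };

    // Definition in a header: fine for a template's static member,
    // since templated entities are exempt from the one-definition rule.
    template <typename T>
    constexpr uint32_t tables<T>::values[];

    static_assert(tables<>::values[3] == 125);
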
+
 // big integer type. implements a small subset of big integer
 // arithmetic, using simple algorithms since asymptotically
 // faster algorithms are slower for a small number of limbs.
 // all operations assume the big-integer is normalized.
-struct bigint {
+struct bigint : pow5_tables<> {
   // storage of the limbs, in little-endian order.
   stackvec<bigint_limbs> vec;
 
-  bigint(): vec() {}
+  FASTFLOAT_CONSTEXPR20 bigint(): vec() {}
   bigint(const bigint &) = delete;
   bigint &operator=(const bigint &) = delete;
   bigint(bigint &&) = delete;
   bigint &operator=(bigint &&other) = delete;
 
-  bigint(uint64_t value): vec() {
+  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value): vec() {
 #ifdef FASTFLOAT_64BIT_LIMB
     vec.push_unchecked(value);
 #else
@@ -387,7 +427,7 @@ struct bigint {
 
   // get the high 64 bits from the vector, and if bits were truncated.
   // this is to get the significant digits for the float.
-  uint64_t hi64(bool& truncated) const noexcept {
+  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool& truncated) const noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
     if (vec.len() == 0) {
       return empty_hi64(truncated);
@@ -419,7 +459,7 @@ struct bigint {
   // positive, this is larger, otherwise they are equal.
   // the limbs are stored in little-endian order, so we
   // must compare the limbs in reverse order.
-  int compare(const bigint& other) const noexcept {
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint& other) const noexcept {
     if (vec.len() > other.vec.len()) {
       return 1;
     } else if (vec.len() < other.vec.len()) {
@@ -440,7 +480,7 @@ struct bigint {
 
   // shift left each limb n bits, carrying over to the new limb
   // returns true if we were able to shift all the digits.
-  bool shl_bits(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept {
     // Internally, for each item, we shift left by n, and add the previous
     // right shifted limb-bits.
     // For example, we transform (for u8) shifted left 2, to:
@@ -466,7 +506,7 @@ struct bigint {
   }
 
   // move the limbs left by `n` limbs.
-  bool shl_limbs(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept {
     FASTFLOAT_DEBUG_ASSERT(n != 0);
     if (n + vec.len() > vec.capacity()) {
       return false;
@@ -487,7 +527,7 @@ struct bigint {
   }
 
   // move the limbs left by `n` bits.
-  bool shl(size_t n) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept {
     size_t rem = n % limb_bits;
     size_t div = n / limb_bits;
     if (rem != 0) {
@@ -500,7 +540,7 @@ struct bigint {
   }
 
   // get the number of leading zeros in the bigint.
-  int ctlz() const noexcept {
+  FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept {
     if (vec.is_empty()) {
       return 0;
     } else {
@@ -515,45 +555,27 @@ struct bigint {
     }
   }
 
   // get the number of bits in the bigint.
-  int bit_length() const noexcept {
+  FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept {
     int lz = ctlz();
     return int(limb_bits * vec.len()) - lz;
   }
 
-  bool mul(limb y) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept {
     return small_mul(vec, y);
   }
 
-  bool add(limb y) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept {
     return small_add(vec, y);
   }
 
   // multiply as if by 2 raised to a power.
-  bool pow2(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept {
     return shl(exp);
   }
 
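
pow2 above reduces to shl, which in turn leans on shl_bits: every limb is shifted left by n while receiving the bits that spilled out of the previous, less significant limb. A reduced two-limb sketch with 8-bit limbs (illustrative names, not part of the patch; assumes 0 < n < 8):

    #include <cstdint>

    constexpr void shl2limbs(uint8_t (&limbs)[2], int n) {
      uint8_t carry = uint8_t(limbs[0] >> (8 - n)); // bits spilling out of limb 0
      limbs[0] = uint8_t(limbs[0] << n);
      limbs[1] = uint8_t((limbs[1] << n) | carry);  // limb 1 is more significant
    }

    constexpr bool shl2limbs_check() {
      uint8_t limbs[2] = {0x5d, 0x35};  // little-endian limbs of 0x355d
      shl2limbs(limbs, 2);
      return limbs[0] == 0x74 && limbs[1] == 0xd5;  // 0xd574 == 0x355d << 2
    }
    static_assert(shl2limbs_check());
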
   // multiply as if by 5 raised to a power.
-  bool pow5(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
     // multiply by a power of 5
-    static constexpr uint32_t large_step = 135;
-    static constexpr uint64_t small_power_of_5[] = {
-      1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL,
-      1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL,
-      6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL,
-      3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL,
-      2384185791015625UL, 11920928955078125UL, 59604644775390625UL,
-      298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL,
-    };
-#ifdef FASTFLOAT_64BIT_LIMB
-    constexpr static limb large_power_of_5[] = {
-      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
-      10482974169319127550UL, 198276706040285095UL};
-#else
-    constexpr static limb large_power_of_5[] = {
-      4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U,
-      1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U};
-#endif
     size_t large_length = sizeof(large_power_of_5) / sizeof(limb);
     limb_span large = limb_span(large_power_of_5, large_length);
     while (exp >= large_step) {
@@ -579,7 +601,7 @@ struct bigint {
   }
 
   // multiply as if by 10 raised to a power.
-  bool pow10(uint32_t exp) noexcept {
+  FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept {
     FASTFLOAT_TRY(pow5(exp));
     return pow2(exp);
   }
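
These two patches make the support headers (float_common.h and bigint.h) constexpr-ready; once the parser entry points are likewise marked with FASTFLOAT_CONSTEXPR20, as in released fast_float 4.0, from_chars itself becomes usable in constant expressions. A minimal usage sketch under that assumption, on a C++20 compiler that satisfies the feature-test macros checked in patch 1:

    #include "fast_float/fast_float.h"
    #include <string_view>

    constexpr double parse(std::string_view s) {
      double d = 0;
      auto answer = fast_float::from_chars(s.data(), s.data() + s.size(), d);
      if (answer.ec != std::errc()) { return 0; /* handle errors in real code */ }
      return d;
    }

    // Both sides are correctly rounded, so they compare equal.
    static_assert(parse("3.1416") == 3.1416);
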