From 5136b181bab4f9268c70a4c3a61ab00a897381bf Mon Sep 17 00:00:00 2001 From: Maya Warrier Date: Tue, 2 May 2023 01:41:49 -0400 Subject: [PATCH] Fixes and cleanup --- include/fast_float/ascii_number.h | 129 +++++++++++++++--------------- include/fast_float/float_common.h | 3 + 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 82a06b1..a15c2ef 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -9,6 +9,8 @@ #include "float_common.h" +#define FASTFLOAT_SSE2 + #ifdef FASTFLOAT_SSE2 #include #endif @@ -16,6 +18,15 @@ namespace fast_float { +template +fastfloat_really_inline constexpr bool has_simd_opts() { +#ifdef FASTFLOAT_HAS_SIMD + return std::is_same::value; +#else + return false; +#endif +} + // Next function can be micro-optimized, but compilers are entirely // able to optimize it well. template @@ -34,42 +45,6 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { | (val & 0x00000000000000FF) << 56; } - -#ifdef FASTFLOAT_SSE2 - -fastfloat_really_inline -__m128i load_packus_masks_c16(void) noexcept { -FASTFLOAT_SIMD_DISABLE_WARNINGS - static const char16_t masks[] = { 0xff, 0xff, 0xff, 0xff }; - return _mm_loadu_si128(reinterpret_cast(masks)); -FASTFLOAT_SIMD_RESTORE_WARNINGS -} - -// packus_masks is an argument only so its value may be preloaded. -// it should always come from load_packus_masks_c16(). -fastfloat_really_inline -uint64_t simd_read8_to_u64(const char16_t* chars, const __m128i packus_masks) { -FASTFLOAT_SIMD_DISABLE_WARNINGS - // process 4 and 4 chars simultaneously (loadu_si64 has high latency) - // with AVX512BW + AVX512VL, masking is not required as we can use cvtepi16_epi8 - const char* const p = reinterpret_cast(chars); - __m128i i1 = _mm_and_si128(_mm_loadu_si64(p), packus_masks); - __m128i i2 = _mm_and_si128(_mm_loadu_si64(p + 8), packus_masks); - __m128i packed = _mm_packus_epi16(i1, i2); - - uint64_t val; - _mm_storeu_si64(&val, _mm_shuffle_epi32(packed, 0x8)); - return val; -FASTFLOAT_SIMD_RESTORE_WARNINGS -} - -// https://quick-bench.com/q/fk6Y07KDGu8XZ9iUtQD8QJTc3Hg -fastfloat_really_inline -uint64_t simd_read8_to_u64(const char16_t* chars) { - return simd_read8_to_u64(chars, load_packus_masks_c16()); -} -#endif - // Read 8 UC into a u64. Truncates UC if not char. template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 @@ -77,7 +52,7 @@ uint64_t read8_to_u64(const UC *chars) { if (cpp20_and_in_constexpr() || !std::is_same::value) { uint64_t val = 0; for(int i = 0; i < 8; ++i) { - val |= uint64_t(char(*chars)) << (i*8); + val |= uint64_t(uint8_t(*chars)) << (i*8); ++chars; } return val; @@ -91,6 +66,35 @@ uint64_t read8_to_u64(const UC *chars) { return val; } +#ifdef FASTFLOAT_SSE2 + +fastfloat_really_inline +uint64_t simd_read8_to_u64(const char16_t* chars) { +FASTFLOAT_SIMD_DISABLE_WARNINGS + static const char16_t kmasks[] = { 0xff, 0xff, 0xff, 0xff }; + const __m128i masks = _mm_loadu_si128(reinterpret_cast(kmasks)); + + // pipeline 4 and 4 chars at the same time (since loadu_si64 has high latency) + // todo: with AVX512BW + AVX512VL, can use cvtepi16_epi8 instead + const char* const p = reinterpret_cast(chars); + __m128i i1 = _mm_and_si128(_mm_loadu_si64(p), masks); + __m128i i2 = _mm_and_si128(_mm_loadu_si64(p + 8), masks); + __m128i packed = _mm_packus_epi16(i1, i2); + + uint64_t val; + _mm_storeu_si64(&val, _mm_shuffle_epi32(packed, 0x8)); + return val; +FASTFLOAT_SIMD_RESTORE_WARNINGS +} +#endif + +// dummy for compile +template ())> +uint64_t simd_read8_to_u64(UC const*) { + return 0; +} + + fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void write_u64(uint8_t *chars, uint64_t val) { if (cpp20_and_in_constexpr()) { @@ -122,28 +126,13 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) { // Call this if chars are definitely 8 digits. +template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint32_t parse_eight_digits_unrolled(const char* chars) noexcept { - return parse_eight_digits_unrolled(read8_to_u64(chars)); -} - -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint32_t parse_eight_digits_unrolled(const char16_t* chars) noexcept { - if (cpp20_and_in_constexpr() || !has_simd()) { - return parse_eight_digits_unrolled(read8_to_u64(chars)); +uint32_t parse_eight_digits_unrolled(UC const * chars) noexcept { + if (cpp20_and_in_constexpr() || !has_simd_opts()) { + return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay } -#ifdef FASTFLOAT_HAS_SIMD return parse_eight_digits_unrolled(simd_read8_to_u64(chars)); -#else - // never reaches here, removes warning - return 0; -#endif -} - -// todo, no simd optimization yet -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint32_t parse_eight_digits_unrolled(const char32_t* chars) noexcept { - return parse_eight_digits_unrolled(read8_to_u64(chars)); } @@ -163,8 +152,12 @@ bool parse_if_eight_digits_unrolled(const char* chars, uint64_t& i) noexcept { } // Call this if chars might not be 8 digits. -// Using this (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled()) +// Using this (instead of is_made_of_eight_digits_fast() and parse_eight_digits_unrolled()) // ensures we don't load SIMD registers twice. +// +// Benchmark: +// https://quick-bench.com/q/Bbn0B4WmZsdgS3qDZWpggAY-jgs +// fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept { #ifdef FASTFLOAT_SSE2 @@ -173,7 +166,6 @@ bool parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept } FASTFLOAT_SIMD_DISABLE_WARNINGS const __m128i data = _mm_loadu_si128(reinterpret_cast(chars)); - const __m128i packus_masks = load_packus_masks_c16(); // be optimistic, preload // (x - '0') <= 9 // http://0x80.pl/articles/simd-parsing-int-sequences.html @@ -181,7 +173,7 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-119)); if (_mm_movemask_epi8(t1) == 0) { - uint64_t digits = simd_read8_to_u64(chars, packus_masks); + uint64_t digits = simd_read8_to_u64(chars); i = i * 100000000 + parse_eight_digits_unrolled(digits); return true; } @@ -189,6 +181,8 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS FASTFLOAT_SIMD_RESTORE_WARNINGS #else // No SIMD available + + (void)chars; (void)i; // unused return false; #endif } @@ -212,8 +206,10 @@ struct parsed_number_string_t { span integer{}; // non-nullable span fraction{}; // nullable }; + using byte_span = span; using parsed_number_string = parsed_number_string_t; + // Assuming that you use no more than 19 digits, this will // parse an ASCII string. template @@ -265,6 +261,7 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par while ((p != pend) && is_integer(*p)) { uint8_t digit = uint8_t(*p - UC('0')); ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok } exponent = before - p; answer.fraction = span(before, size_t(p - before)); @@ -336,20 +333,20 @@ parsed_number_string_t parse_number_string(UC const *p, UC const * pend, par return answer; } -template +template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void parse_truncated_number_string(parsed_number_string& ps) +void parse_truncated_number_string(parsed_number_string_t& ps) { // Let us start again, this time, avoiding overflows. // We don't need to check if is_integer, since we use the // pre-tokenized spans. uint64_t i = 0; int64_t exponent = 0; - const CharT* p = ps.integer.ptr; - const CharT* const int_end = p + ps.integer.len(); + const UC* p = ps.integer.ptr; + const UC* const int_end = p + ps.integer.len(); const uint64_t minimal_nineteen_digit_integer{1000000000000000000}; while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { - i = i * 10 + uint64_t(*p - CharT('0')); + i = i * 10 + uint64_t(*p - UC('0')); ++p; } if (i >= minimal_nineteen_digit_integer) { // We have a big integers @@ -357,9 +354,9 @@ void parse_truncated_number_string(parsed_number_string& ps) } else { // We have a value with a fractional component. p = ps.fraction.ptr; - const CharT* const frac_end = p + ps.fraction.len(); + const UC* const frac_end = p + ps.fraction.len(); while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { - i = i * 10 + uint64_t(*p - CharT('0')); + i = i * 10 + uint64_t(*p - UC('0')); ++p; } exponent = ps.fraction.ptr - p + ps.exp_number; diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h index f63a090..175389f 100644 --- a/include/fast_float/float_common.h +++ b/include/fast_float/float_common.h @@ -117,6 +117,9 @@ // rust style `try!()` macro, or `?` operator #define FASTFLOAT_TRY(x) { if (!(x)) return false; } +#define FASTFLOAT_ENABLE_IF(test) typename std::enable_if<(test), int>::type = 0 + + namespace fast_float { fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() {