From 8a9a9d538a2721ad6d8143230a08104c30032a37 Mon Sep 17 00:00:00 2001 From: Maya Warrier Date: Sat, 1 Apr 2023 22:43:00 -0400 Subject: [PATCH] SIMD optimization to parse 8 char16_t at a time --- include/fast_float/ascii_number.h | 77 +++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 9c2ed8a..3b192dc 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -35,8 +35,7 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { } fastfloat_really_inline -uint64_t fast_read_u64(const char* chars) -{ +uint64_t fast_read_u64(const char* chars) { uint64_t val; ::memcpy(&val, chars, sizeof(uint64_t)); return val; @@ -44,8 +43,7 @@ uint64_t fast_read_u64(const char* chars) // https://quick-bench.com/q/fk6Y07KDGu8XZ9iUtQD8QJTc3Hg fastfloat_really_inline -uint64_t fast_read_u64(const char16_t* chars) -{ +uint64_t fast_read_u64(const char16_t* chars) { #if FASTFLOAT_SSE2 FASTFLOAT_SIMD_DISABLE_WARNINGS static const char16_t masks[] = {0xff, 0xff, 0xff, 0xff}; @@ -63,7 +61,7 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS return val; FASTFLOAT_SIMD_RESTORE_WARNINGS #else - alignas(8) unsigned char bytes[8]; + unsigned char bytes[8]; for (int i = 0; i < 8; ++i) bytes[i] = (unsigned char)chars[i]; @@ -122,24 +120,74 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) { return uint32_t(val); } -template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -uint32_t parse_eight_digits_unrolled(const CharT *chars) noexcept { - return parse_eight_digits_unrolled(read_u64(chars)); -} - // credit @aqrit fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept { return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & 0x8080808080808080)); } -template fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_made_of_eight_digits_fast(const CharT *chars) noexcept { +uint32_t parse_eight_digits_unrolled(const char* chars) noexcept { + return parse_eight_digits_unrolled(read_u64(chars)); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool is_made_of_eight_digits_fast(const char *chars) noexcept { return is_made_of_eight_digits_fast(read_u64(chars)); } +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool parse_if_eight_digits_unrolled(const char* chars, std::uint64_t& i) noexcept { + const bool all = is_made_of_eight_digits_fast(chars); + if (all) i = i * 100000000 * parse_eight_digits_unrolled(chars); + return all; +} + +// http://0x80.pl/articles/simd-parsing-int-sequences.html +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 +bool parse_if_eight_digits_unrolled(const char16_t* chars, std::uint64_t& i) noexcept { + if (cpp20_and_in_constexpr() || !FASTFLOAT_SSE2) { + for (int i = 0; i < 8; ++i) { + if (chars[i] < u'0' || chars[i] > u'9') + return false; + } + i = i * 100000000 + parse_eight_digits_unrolled(read_u64(chars)); + return true; + } +#if FASTFLOAT_SSE2 +FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i data = _mm_loadu_si128(reinterpret_cast(chars)); + + // (x - '0') <= 9 + const __m128i t0 = _mm_sub_epi16(data, _mm_set1_epi16(80)); + const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-119)); + const bool is_digits = _mm_movemask_epi8(t1) == 0; + + if (is_digits) { + // x - '0' + const __m128i s1digits16 = _mm_sub_epi16(data, _mm_set1_epi16('0')); + // 10 * x(b) + x(b-1) -> 2 digit numbers + const __m128i s2digits32 = _mm_madd_epi16(s1digits16, _mm_setr_epi16(10, 1, 10, 1, 10, 1, 10, 1); + const __m128i s2digits16 = _mm_packus_epi16(s2digits32, s2digits32); + // 100 * x(b) + x(b-1) -> 4 digit numbers + const __m128i s4digits32 = _mm_madd_epi16(s2digits16, _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1)); + const __m128i s4digits16 = _mm_packus_epi16(s4digits32, s4digits32); + // 10000 * x(b) + x(b-1) -> 8 digit number + const __m128i s8digits32 = _mm_madd_epi16(s4digits16, _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1)); + + uint32_t value; + _mm_storeu_si32(&value, s8digits32); + + i = i * 100000000 + value; + return true; + } + else return false; +FASTFLOAT_SIMD_RESTORE_WARNINGS +#endif +} + + + typedef span byte_span; template @@ -203,8 +251,7 @@ parsed_number_string parse_number_string(const CharT *p, const CharT *pen const CharT* before = p; // can occur at most twice without overflowing, but let it occur more, since // for integers with many digits, digit parsing is the primary bottleneck. - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok + while ((std::distance(p, pend) >= 8) && parse_if_eight_digits_unrolled(p)) { // in rare cases, this will overflow, but that's ok p += 8; } while ((p != pend) && is_integer(*p)) {