SIMD optimization to parse 8 char16_t at a time

2025-12-07 01:06:48 +08:00 · 2023-04-01 22:43:00 -04:00 · 2023-04-01 22:43:00 -04:00 · 8a9a9d538a
commit 8a9a9d538a
parent f59f73c4da
1 changed files with 62 additions and 15 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@ -35,8 +35,7 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
 }

 fastfloat_really_inline
-uint64_t fast_read_u64(const char* chars)
-{
+uint64_t fast_read_u64(const char* chars) {
  uint64_t val;
  ::memcpy(&val, chars, sizeof(uint64_t));
  return val;
@ -44,8 +43,7 @@ uint64_t fast_read_u64(const char* chars)

 // https://quick-bench.com/q/fk6Y07KDGu8XZ9iUtQD8QJTc3Hg
 fastfloat_really_inline
-uint64_t fast_read_u64(const char16_t* chars)
-{
+uint64_t fast_read_u64(const char16_t* chars) {
 #if FASTFLOAT_SSE2
 FASTFLOAT_SIMD_DISABLE_WARNINGS
  static const char16_t masks[] = {0xff, 0xff, 0xff, 0xff};
@ -63,7 +61,7 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS
  return val;
 FASTFLOAT_SIMD_RESTORE_WARNINGS
 #else
-  alignas(8) unsigned char bytes[8];
+  unsigned char bytes[8];
  for (int i = 0; i < 8; ++i)
      bytes[i] = (unsigned char)chars[i];

@ -122,24 +120,74 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
  return uint32_t(val);
 }

-template <typename CharT>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint32_t parse_eight_digits_unrolled(const CharT *chars)  noexcept  {
-  return parse_eight_digits_unrolled(read_u64(chars));
-}
-
 // credit @aqrit
 fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val)  noexcept  {
  return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
     0x8080808080808080));
 }

-template <typename CharT>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool is_made_of_eight_digits_fast(const CharT *chars)  noexcept  {
+uint32_t parse_eight_digits_unrolled(const char* chars)  noexcept {
+    return parse_eight_digits_unrolled(read_u64(chars));
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool is_made_of_eight_digits_fast(const char *chars)  noexcept  {
  return is_made_of_eight_digits_fast(read_u64(chars));
 }

+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool parse_if_eight_digits_unrolled(const char* chars, std::uint64_t& i) noexcept {
+    const bool all = is_made_of_eight_digits_fast(chars);
+    if (all) i = i * 100000000 * parse_eight_digits_unrolled(chars);
+    return all;
+}
+
+// http://0x80.pl/articles/simd-parsing-int-sequences.html
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool parse_if_eight_digits_unrolled(const char16_t* chars, std::uint64_t& i) noexcept {
+  if (cpp20_and_in_constexpr() || !FASTFLOAT_SSE2) {
+    for (int i = 0; i < 8; ++i) {
+      if (chars[i] < u'0' || chars[i] > u'9')
+        return false;
+    }
+    i = i * 100000000 + parse_eight_digits_unrolled(read_u64(chars));
+    return true;
+  }
+#if FASTFLOAT_SSE2
+FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i data = _mm_loadu_si128(reinterpret_cast<const char*>(chars));
+
+  // (x - '0') <= 9
+  const __m128i t0 = _mm_sub_epi16(data, _mm_set1_epi16(80));
+  const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-119));
+  const bool is_digits = _mm_movemask_epi8(t1) == 0;
+
+  if (is_digits) {
+    // x - '0'
+    const __m128i s1digits16 = _mm_sub_epi16(data, _mm_set1_epi16('0'));
+    // 10 * x(b) + x(b-1) -> 2 digit numbers
+    const __m128i s2digits32 = _mm_madd_epi16(s1digits16, _mm_setr_epi16(10, 1, 10, 1, 10, 1, 10, 1);
+    const __m128i s2digits16 = _mm_packus_epi16(s2digits32, s2digits32);
+    // 100 * x(b) + x(b-1) -> 4 digit numbers
+    const __m128i s4digits32 = _mm_madd_epi16(s2digits16, _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1));
+    const __m128i s4digits16 = _mm_packus_epi16(s4digits32, s4digits32);
+    // 10000 * x(b) + x(b-1) -> 8 digit number
+    const __m128i s8digits32 = _mm_madd_epi16(s4digits16, _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1));
+
+    uint32_t value;
+    _mm_storeu_si32(&value, s8digits32);
+
+    i = i * 100000000 + value;
+    return true;
+  }
+  else return false;
+FASTFLOAT_SIMD_RESTORE_WARNINGS
+#endif
+}
+
+
+
 typedef span<const char> byte_span;

 template <typename CharT>
@ -203,8 +251,7 @@ parsed_number_string<CharT> parse_number_string(const CharT *p, const CharT *pen
    const CharT* before = p;
    // can occur at most twice without overflowing, but let it occur more, since
    // for integers with many digits, digit parsing is the primary bottleneck.
-    while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
-      i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
+    while ((std::distance(p, pend) >= 8) && parse_if_eight_digits_unrolled(p)) {  // in rare cases, this will overflow, but that's ok
      p += 8;
    }
    while ((p != pend) && is_integer(*p)) {