Add basic support for char32_t (unoptimized)

2026-01-01 03:12:18 +08:00 · 2023-04-30 02:20:24 -04:00 · 2023-04-30 02:20:24 -04:00 · 091458d192
commit 091458d192
parent 65bd922e38
2 changed files with 81 additions and 87 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@ -34,49 +34,47 @@ fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
    | (val & 0x00000000000000FF) << 56;
 }

+
+#ifdef FASTFLOAT_SSE2
+
 fastfloat_really_inline
-uint64_t fast_read_u64(const char* chars) {
-  uint64_t val;
-  ::memcpy(&val, chars, sizeof(uint64_t));
-  return val;
+__m128i load_packus_masks_c16(void) noexcept { // returns 0x00ff in every 16-bit lane (keeps the low byte of each char16_t)
+FASTFLOAT_SIMD_DISABLE_WARNINGS
+  static const char16_t masks[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; // 8 lanes = 16 bytes, exactly what the 128-bit load reads
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(masks));
+FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

-// https://quick-bench.com/q/fk6Y07KDGu8XZ9iUtQD8QJTc3Hg
-// todo: add support for char32_t
+// Reads 8 char16_t, masks each to its low byte and packs the 8 bytes into a u64.
+// packus_masks is a parameter only so it can be preloaded; it must come from load_packus_masks_c16().
 fastfloat_really_inline
-uint64_t fast_read_u64(const char16_t* chars) {
-#ifdef FASTFLOAT_SSE2
+uint64_t simd_read8_to_u64(const char16_t* chars, const __m128i packus_masks) {
 FASTFLOAT_SIMD_DISABLE_WARNINGS
-  static const char16_t masks[] = {0xff, 0xff, 0xff, 0xff};
-  const __m128i m_masks = _mm_loadu_si128(reinterpret_cast<const __m128i*>(masks));
-
-  // mask hi bytes and pack
-  const char* const p = reinterpret_cast<const char*>(chars);
-  __m128i i1 = _mm_and_si128(_mm_loadu_si64(p), m_masks);
-  __m128i i2 = _mm_and_si128(_mm_loadu_si64(p + 8), m_masks);
+  // process 4 and 4 chars simultaneously (loadu_si64 has high latency)
+  // with AVX512BW + AVX512VL, masking is not required as we have cvtepi16_epi8
+  const char* const p = reinterpret_cast<const char*>(chars); 
+  __m128i i1 = _mm_and_si128(_mm_loadu_si64(p), packus_masks);
+  __m128i i2 = _mm_and_si128(_mm_loadu_si64(p + 8), packus_masks);
   __m128i packed = _mm_packus_epi16(i1, i2);

-  // extract
   uint64_t val;
   _mm_storeu_si64(&val, _mm_shuffle_epi32(packed, 0x8));
   return val;
 FASTFLOAT_SIMD_RESTORE_WARNINGS
-#else
-  unsigned char bytes[8];
-  for (int i = 0; i < 8; ++i)
-      bytes[i] = (unsigned char)chars[i];
-
-  // bit-cast
-  uint64_t val;
-  ::memcpy(&val, bytes, sizeof(uint64_t));
-  return val;
-#endif
 }

+// Overload that loads the packus masks itself; see https://quick-bench.com/q/fk6Y07KDGu8XZ9iUtQD8QJTc3Hg
+fastfloat_really_inline
+uint64_t simd_read8_to_u64(const char16_t* chars) {
+  return simd_read8_to_u64(chars, load_packus_masks_c16());
+}
+#endif
+
+// Read 8 CharT into a u64, one byte per character. CharT wider than char is truncated to a char.
 template <typename CharT>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t read_u64(const CharT *chars) {
-  if (cpp20_and_in_constexpr()) {
+uint64_t read8_to_u64(const CharT *chars) {
+  if (cpp20_and_in_constexpr() || !std::is_same<CharT, char>::value) {
    uint64_t val = 0;
    for(int i = 0; i < 8; ++i) {
      val |= uint64_t(char(*chars)) << (i*8);
@ -84,7 +82,8 @@ uint64_t read_u64(const CharT *chars) {
    }
    return val;
  }
-  uint64_t val = fast_read_u64(chars);
+  uint64_t val;
+  ::memcpy(&val, chars, sizeof(uint64_t));
 #if FASTFLOAT_IS_BIG_ENDIAN == 1
  // Need to read as-if the number was in little-endian order.
  val = byteswap(val);
@ -121,92 +120,87 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
  return uint32_t(val);
 }

-// http://0x80.pl/articles/simd-parsing-int-sequences.html
-#ifdef FASTFLOAT_SSE2
-fastfloat_really_inline
-uint32_t parse_eight_digits_unrolled_c16(const __m128i val) {
-  // x - '0'
-  const __m128i s1digits16 = _mm_sub_epi16(val, _mm_set1_epi16('0'));
-  // 10 * x(b) + x(b-1) -> 2 digit numbers
-  const __m128i s2digits32 = _mm_madd_epi16(s1digits16, _mm_setr_epi16(10, 1, 10, 1, 10, 1, 10, 1));
-  const __m128i s2digits16 = _mm_packus_epi16(s2digits32, s2digits32);
-  // 100 * x(b) + x(b-1) -> 4 digit numbers
-  const __m128i s4digits32 = _mm_madd_epi16(s2digits16, _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1));
-  const __m128i s4digits16 = _mm_packus_epi16(s4digits32, s4digits32);
-  // 10000 * x(b) + x(b-1) -> 8 digit number
-  const __m128i s8digits32 = _mm_madd_epi16(s4digits16, _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1));
-
-  uint32_t value;
-  _mm_storeu_si32(&value, s8digits32);
-  return value;
-}
-#endif
-
-// credit @aqrit
-fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val)  noexcept  {
-  return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
-     0x8080808080808080));
-}

+// Reads 8 chars into a u64 and parses them. Call this only if chars are definitely 8 digits.
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint32_t parse_eight_digits_unrolled(const char* chars)  noexcept {
-    return parse_eight_digits_unrolled(read_u64(chars));
+    return parse_eight_digits_unrolled(read8_to_u64(chars));
 }

-// Call this if you know chars are only digits
-//todo: add support for char32_t
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint32_t parse_eight_digits_unrolled(const char16_t* chars)  noexcept {
   if (cpp20_and_in_constexpr() || !has_simd()) {
-    return parse_eight_digits_unrolled(read_u64(chars));
+    return parse_eight_digits_unrolled(read8_to_u64(chars));
   }
-#ifndef FASTFLOAT_HAS_SIMD
-  return 0; // never reaches here, remove warning
+#ifdef FASTFLOAT_SSE2 // same guard as the definition of simd_read8_to_u64
+  return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
 #else
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  return parse_eight_digits_unrolled_c16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(chars)));
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  // no SSE2 reader in this build: take the scalar path rather than return a dummy 0
+  return parse_eight_digits_unrolled(read8_to_u64(chars));
 #endif
 }

+// TODO: no SIMD optimization for char32_t yet; always uses the scalar read8_to_u64.
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool parse_if_eight_digits_unrolled(const char* chars, std::uint64_t& i) noexcept {
-    const bool all = is_made_of_eight_digits_fast(read_u64(chars));
-    if (all) i = i * 100000000 + parse_eight_digits_unrolled(read_u64(chars));
-    return all;
+uint32_t parse_eight_digits_unrolled(const char32_t* chars)  noexcept {
+  return parse_eight_digits_unrolled(read8_to_u64(chars));
+}
+
+
+// Returns true when each of the 8 bytes of val is an ASCII digit '0'..'9' (all bytes checked at once). credit @aqrit
+fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val)  noexcept {
+  return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
+    0x8080808080808080));
 }

-// Call this if you don't know whether chars are only digits
-// http://0x80.pl/articles/simd-parsing-int-sequences.html
-//todo: add support for char32_t
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool parse_if_eight_digits_unrolled(const char16_t* chars, std::uint64_t& i) noexcept {
-  if (cpp20_and_in_constexpr() || !has_simd()) {
-    for (int i = 0; i < 8; ++i) {
-      if (chars[i] < u'0' || chars[i] > u'9')
-        return false;
-    }
-    i = i * 100000000 + parse_eight_digits_unrolled(read_u64(chars));
-    return true;
+bool parse_if_eight_digits_unrolled(const char* chars, uint64_t& i) noexcept { // char overload: check 8 chars, parse only if all are digits
+  const bool is_digits = is_made_of_eight_digits_fast(read8_to_u64(chars));
+  if (is_digits) {
+    i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(chars)); // shift i by 8 decimal places and append the digits
   }
-#ifndef FASTFLOAT_HAS_SIMD
-  return false; // never reaches here, remove warning
-#else
+  return is_digits; // i is left untouched when this is false
+}
+
+// Call this if chars might not be 8 digits: it checks and parses in one call, so callers need not
+// run is_made_of_eight_digits_fast() and parse_eight_digits_unrolled() separately. Returns true and
+// appends the digits to i only if all 8 are '0'..'9'; otherwise returns false and leaves i alone.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept {
+#ifdef FASTFLOAT_SSE2
+  if (cpp20_and_in_constexpr()) {
+    return false;
+  }
 FASTFLOAT_SIMD_DISABLE_WARNINGS
   const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chars));
+  const __m128i packus_masks = load_packus_masks_c16(); // be optimistic, preload
+
   // (x - '0') <= 9
+  // http://0x80.pl/articles/simd-parsing-int-sequences.html
-  const __m128i t0 = _mm_sub_epi16(data, _mm_set1_epi16(80));
-  const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-119));
+  const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720)); // 16-bit bias: '0'..'9' wrap to -32768..-32759
+  const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759)); // lane is set iff that char is not a digit

   if (_mm_movemask_epi8(t1) == 0) {
-    i = i * 100000000 + parse_eight_digits_unrolled_c16(data);
+    uint64_t digits = simd_read8_to_u64(chars, packus_masks);
+    i = i * 100000000 + parse_eight_digits_unrolled(digits);
     return true;
   }
   else return false;
 FASTFLOAT_SIMD_RESTORE_WARNINGS
+
+#else // No SIMD available
+  return false;
 #endif
 }

+// TODO: no SIMD optimization for char32_t yet; always returns false and ignores both arguments.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+bool parse_if_eight_digits_unrolled(const char32_t*, uint64_t&) noexcept {
+  return false;
+}
+
+
+
 typedef span<const char> byte_span;

 template <typename CharT>
--- a/include/fast_float/digit_comparison.h
+++ b/include/fast_float/digit_comparison.h
@ -158,10 +158,10 @@ void round_down(adjusted_mantissa& am, int32_t shift) noexcept {
 template <typename CharT>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 void skip_zeros(const CharT*& first, const CharT* last) noexcept {
-  if (std::is_same<CharT, char>::value || has_simd()) {
+  if (std::is_same<CharT, char>::value) {
    uint64_t val;
    while (!cpp20_and_in_constexpr() && std::distance(first, last) >= 8) {
-      val = fast_read_u64(first);
+      ::memcpy(&val, first, sizeof(uint64_t));
      if (val != 0x3030303030303030) {
        break;
      }
@ -181,11 +181,11 @@ void skip_zeros(const CharT*& first, const CharT* last) noexcept {
 template <typename CharT>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 bool is_truncated(const CharT* first, const CharT* last) noexcept {
-  if (std::is_same<CharT, char>::value || has_simd()) {
+  if (std::is_same<CharT, char>::value) {
    // do 8-bit optimizations, can just compare to 8 literal 0s.
    uint64_t val;
    while (!cpp20_and_in_constexpr() && std::distance(first, last) >= 8) {
-      val = fast_read_u64(first);
+      ::memcpy(&val, first, sizeof(uint64_t));
      if (val != 0x3030303030303030) {
        return true;
      }