From 38613a39f9eb3763a68fe56f8acf22474ed9c5ed Mon Sep 17 00:00:00 2001
From: Maya Warrier <maya.warrierm@gmail.com>
Date: Wed, 17 May 2023 01:34:33 -0400
Subject: [PATCH] Fix perf decrease when UC = char

---
 include/fast_float/ascii_number.h | 80 +++++++++++++++----------------
 include/fast_float/float_common.h |  2 +-
 2 files changed, 39 insertions(+), 43 deletions(-)
diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 39f2a07..cc0af11 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -17,7 +17,7 @@
 namespace fast_float {
 
 template <typename UC>
-fastfloat_really_inline constexpr bool has_simd_opts() {
+fastfloat_really_inline constexpr bool has_simd_opt() {
 #ifdef FASTFLOAT_HAS_SIMD
   return std::is_same<UC, char16_t>::value;
 #else
@@ -68,18 +68,7 @@ uint64_t read8_to_u64(const UC *chars) {
 
 fastfloat_really_inline
 uint64_t simd_read8_to_u64(const __m128i data) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  static const char16_t kmasks[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-  const __m128i masks = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kmasks));
-
-  // todo: with AVX512BW + AVX512VL, can use cvtepi16_epi8 instead of masking + pack
-  __m128i masked = _mm_and_si128(data, masks);
-  __m128i packed = _mm_packus_epi16(masked, masked);
-
-  uint64_t val;
-  _mm_storeu_si64(&val, packed);
-  return val;
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  return uint64_t(_mm_cvtsi128_si64x(_mm_packus_epi16(data, data))); // saturating pack: out-of-range units become 0x00/0xff, never a digit byte; cast the signed result explicitly
 }
 
 fastfloat_really_inline
@@ -92,7 +81,7 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS
 #endif
 
 // dummy for compile
-template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opts<UC>())>
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
 uint64_t simd_read8_to_u64(UC const*) {
   return 0;
 }
@@ -132,7 +121,7 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
 template <typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
 uint32_t parse_eight_digits_unrolled(UC const * chars)  noexcept {
-  if (cpp20_and_in_constexpr() || !has_simd_opts<UC>()) {
+  if (cpp20_and_in_constexpr() || !has_simd_opt<UC>()) {
     return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay
   }
   return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
@@ -145,28 +134,18 @@ fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val
      0x8080808080808080));
 }
 
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool parse_if_eight_digits_unrolled(const char* chars, uint64_t& i) noexcept {
-  if (is_made_of_eight_digits_fast(read8_to_u64(chars))) {
-    i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(chars));
-    return true;
-  }
-  else return false;
-}
+
+#ifdef FASTFLOAT_HAS_SIMD
 
 // Call this if chars might not be 8 digits.
 // Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled())
-// ensures we don't load SIMD registers twice if we don't have to.
-//
-// Benchmark:
-// https://quick-bench.com/q/Bbn0B4WmZsdgS3qDZWpggAY-jgs
-//
+// ensures we don't load SIMD registers twice.
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept {
-#ifdef FASTFLOAT_SSE2
+bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept {
   if (cpp20_and_in_constexpr()) {
     return false;
-  }    
+  }   
+#ifdef FASTFLOAT_SSE2
 FASTFLOAT_SIMD_DISABLE_WARNINGS
   const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chars));
 
@@ -181,18 +160,36 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS
   }
   else return false;
 FASTFLOAT_SIMD_RESTORE_WARNINGS
-
-#else // No SIMD available
-
-  (void)chars; (void)i; // unused
-  return false;
 #endif
 }
 
-// todo, no simd optimization yet
+#endif
+
+// Fallback overload for character types without a SIMD path: lets generic callers compile and never reports a match.
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
+bool simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) {
+  return false;
+}
+
+// Advances p by 8 for each leading group that simd_parse_if_eight_digits_unrolled() accepts (it receives i by reference); returns immediately when UC has no SIMD path.
+template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value)>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) {
+  if (!has_simd_opt<UC>()) {
+    return;
+  }
+  while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
+}
+
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20
+void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) {
+  // UC = char: checking and parsing as two separate steps optimizes better than the old combined parse_if_eight_digits_unrolled().
+  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) {
+    i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok
+    p += 8;
+  }
 }
 
 template <typename UC>
@@ -256,9 +253,8 @@ parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, par
     UC const * before = p;
     // can occur at most twice without overflowing, but let it occur more, since
     // for integers with many digits, digit parsing is the primary bottleneck.
-    while ((std::distance(p, pend) >= 8) && parse_if_eight_digits_unrolled(p, i)) {  // in rare cases, this will overflow, but that's ok
-      p += 8;
-    }
+    loop_parse_if_eight_digits(p, pend, i);
+
     while ((p != pend) && is_integer(*p)) {
       uint8_t digit = uint8_t(*p - UC('0'));
       ++p;
diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h
index 201e72f..80b022e 100644
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -157,7 +157,7 @@ using parse_options = parse_options_t<char>;
 // rust style `try!()` macro, or `?` operator
 #define FASTFLOAT_TRY(x) { if (!(x)) return false; }
 
-#define FASTFLOAT_ENABLE_IF(test) typename std::enable_if<(test), int>::type = 0
+#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0
 
 
 namespace fast_float {