Automatically detect SSE2

2025-12-07 01:06:48 +08:00 · 2023-05-02 13:05:57 -04:00 · 2023-05-02 13:05:57 -04:00 · 4cb09b5f59
commit 4cb09b5f59
parent c811b027ea
2 changed files with 10 additions and 4 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -73,7 +73,7 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS
  const __m128i masks = _mm_loadu_si128(reinterpret_cast<const __m128i*>(kmasks));

  // pipeline 4 and 4 chars at the same time (since loadu_si64 has high latency)
-  // todo: with AVX512BW + AVX512VL, can use cvtepi16_epi8 instead
+  // todo: with AVX512BW + AVX512VL, can use cvtepi16_epi8 instead of masking + pack
  const char* const p = reinterpret_cast<const char*>(chars); 
  __m128i i1 = _mm_and_si128(_mm_loadu_si64(p), masks);
  __m128i i2 = _mm_and_si128(_mm_loadu_si64(p + 8), masks);
@@ -150,8 +150,8 @@ bool parse_if_eight_digits_unrolled(const char* chars, uint64_t& i) noexcept {
 }

 // Call this if chars might not be 8 digits.
-// Using this (instead of is_made_of_eight_digits_fast() and parse_eight_digits_unrolled())
-// ensures we don't load SIMD registers twice.
+// Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled())
+// ensures we don't load SIMD registers twice if we don't have to.
 //
 // Benchmark:
 // https://quick-bench.com/q/Bbn0B4WmZsdgS3qDZWpggAY-jgs
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -78,8 +78,14 @@
 #endif
 #endif

+#if defined(__SSE2__) || \
+  (defined(FASTFLOAT_VISUAL_STUDIO) && \
+    (defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)))
+#define FASTFLOAT_SSE2 1
+#endif
+
 #ifdef FASTFLOAT_SSE2
-#define FASTFLOAT_HAS_SIMD (1)
+#define FASTFLOAT_HAS_SIMD 1
 #endif

 #if defined(__GNUC__)