From ba656ace7887df1176fdb9e58c33acc89bb9b62b Mon Sep 17 00:00:00 2001
From: IRainman <a.rainman@gmail.com>
Date: Tue, 30 Dec 2025 20:15:17 +0300
Subject: [PATCH] improve code generation for byteswap and leading_zeroes
 on all compilers.

---
 include/fast_float/ascii_number.h | 14 ++++++++------
 include/fast_float/float_common.h | 16 ++++++++++++++--
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 41cf977..80d4565 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -34,16 +34,18 @@ fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
   return !(c > UC('9') || c < UC('0'));
 }
 
-#if FASTFLOAT_HAS_BYTESWAP == 0
+// Reverses the byte order of a 64-bit value. MSVC's _byteswap_uint64 is not
+// used because it cannot be called from a constexpr function.
 fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) noexcept {
+#if FASTFLOAT_HAS_BYTESWAP == 1
+  return std::byteswap(val);
+#elif defined(__GNUC__) || defined(__clang__)
+  return __builtin_bswap64(val);
+#else
   return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 |
          (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 |
          (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 |
          (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56;
-}
-#elif FASTFLOAT_HAS_BYTESWAP == 1
-fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) noexcept {
-  return std::byteswap(val);
+#endif
 }
-#endif
 
diff --git a/include/fast_float/float_common.h b/include/fast_float/float_common.h
index 7191b04..079eb6f 100644
--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -411,7 +411,11 @@ leading_zeroes(uint64_t input_num) noexcept {
     return leading_zeroes_generic(input_num);
   }
 #ifdef FASTFLOAT_VISUAL_STUDIO
-#if defined(_M_X64) || defined(_M_ARM64)
+#if defined(__AVX2__) && defined(_M_X64)
+  // Use lzcnt on MSVC only when targeting AVX2-capable x64 CPUs, all of
+  // which support this instruction (__lzcnt64 is not available on x86).
+  return static_cast<limb_t>(__lzcnt64(input_num));
+#elif defined(_M_X64) || defined(_M_ARM64)
   unsigned long leading_zero;
   // Search the mask data from most significant bit (MSB)
   // to least significant bit (LSB) for a set bit (1).
@@ -420,8 +424,16 @@ leading_zeroes(uint64_t input_num) noexcept {
 #else
   return static_cast<limb_t>(leading_zeroes_generic(input_num));
 #endif
+#elif defined(__GNUC__) || defined(__clang__)
+  return static_cast<limb_t>(__builtin_clzll(input_num));
 #else
-  return static_cast<limb_t>(__builtin_clzll(input_num));
+  // Portable fallback for compilers without __builtin_clzll: shift
+  // input_num right until it is zero, decrementing n once per significant
+  // bit, so that n ends up as the number of leading zero bits.
+  int n = 64;
+  for (; input_num > 0; input_num >>= 1)
+    --n;
+  return static_cast<limb_t>(n);
 #endif
 }