Improve code generation for byteswap and leading_zeroes on all compilers.

2026-01-01 03:12:18 +08:00 · 2025-12-30 20:15:17 +03:00 · 2025-12-30 20:15:17 +03:00 · ba656ace78
commit ba656ace78
parent 49106981a3
2 changed files with 22 additions and 7 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -34,16 +34,19 @@ fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
  return !(c > UC('9') || c < UC('0'));
 }

-#if FASTFLOAT_HAS_BYTESWAP == 0
 fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) noexcept {
+#if FASTFLOAT_HAS_BYTESWAP == 1
+  return std::byteswap(val);
+#elif defined(__has_builtin) && __has_builtin(__builtin_bswap64)
+  return __builtin_bswap64(val);
+#elif defined(_MSC_VER)
+  return _byteswap_uint64(val);
+#else
  return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 |
         (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 |
         (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 |
         (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56;
-}
-#elif FASTFLOAT_HAS_BYTESWAP == 1
-fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) noexcept {
-  return std::byteswap(val);
+#endif
 }
 #endif

--- a/include/fast_float/float_common.h
+++ b/include/fast_float/float_common.h
@@ -411,7 +411,11 @@ leading_zeroes(uint64_t input_num) noexcept {
    return leading_zeroes_generic(input_num);
  }
 #ifdef FASTFLOAT_VISUAL_STUDIO
-#if defined(_M_X64) || defined(_M_ARM64)
+#if defined(__AVX2__)
+  // Use lzcnt on MSVC only when targeting AVX2-capable CPUs, all of which
+  // also support this bit-manipulation instruction.
+  return __lzcnt64(x);
+#elif defined(_M_X64) || defined(_M_ARM64)
  unsigned long leading_zero;
  // Search the mask data from most significant bit (MSB)
  // to least significant bit (LSB) for a set bit (1).
@@ -420,8 +424,16 @@ leading_zeroes(uint64_t input_num) noexcept {
 #else
  return static_cast<limb_t>(leading_zeroes_generic(input_num));
 #endif
+#elif __has_builtin(__builtin_clzll)
+  return static_cast<limb_t>(__builtin_clzll(x));
 #else
-  return static_cast<limb_t>(__builtin_clzll(input_num));
+  // Unlike MSVC, clang and gcc recognize this implementation and replace
+  // it with the assembly instructions which are appropriate for the
+  // target (lzcnt or bsr + zero handling).
+  int n = 64;
+  for (; leading_zero > 0; leading_zero >>= 1)
+    --n;
+  return static_cast<limb_t>(n);
 #endif
 }