Merge pull request #28 from lemire/dlemire/aqrit_magic

Magical optimizations from @aqrit
2026-01-01 03:12:18 +08:00 · 2020-11-23 18:23:28 -05:00 · 2020-11-23 18:23:28 -05:00 · caade69916
commit caade69916
parent e78525d3e2 7ef9d9b7d2
4 changed files with 20 additions and 11 deletions
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -15,22 +15,29 @@ namespace fast_float {
 fastfloat_really_inline bool is_integer(char c)  noexcept  { return c >= '0' && c <= '9'; }


-// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
+// credit  @aqrit
+fastfloat_really_inline uint32_t  parse_eight_digits_unrolled(uint64_t val) {
+  const uint64_t mask = 0x000000FF000000FF;
+  const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
+  const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
+  val -= 0x3030303030303030;
+  val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
+  val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
+  return uint32_t(val);
+}
+
 fastfloat_really_inline uint32_t parse_eight_digits_unrolled(const char *chars)  noexcept  {
  uint64_t val;
  ::memcpy(&val, chars, sizeof(uint64_t));
-  val = (val & 0x0F0F0F0F0F0F0F0F) * 2561 >> 8;
-  val = (val & 0x00FF00FF00FF00FF) * 6553601 >> 16;
-  return uint32_t((val & 0x0000FFFF0000FFFF) * 42949672960001 >> 32);
+  return parse_eight_digits_unrolled(val);
 }

+// credit @aqrit
 fastfloat_really_inline bool is_made_of_eight_digits_fast(uint64_t val)  noexcept  {
-  return (((val & 0xF0F0F0F0F0F0F0F0) |
-           (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
-          0x3333333333333333);
+  return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
+     0x8080808080808080)); 
 }

-
 fastfloat_really_inline bool is_made_of_eight_digits_fast(const char *chars)  noexcept  {
  uint64_t val;
  ::memcpy(&val, chars, 8);
--- a/include/fast_float/parse_number.h
+++ b/include/fast_float/parse_number.h
@@ -91,7 +91,7 @@ from_chars_result from_chars(const char *first, const char *last,
  }
  answer.ec = std::errc(); // be optimistic
  answer.ptr = pns.lastmatch;
-
+  // Next is Clinger's fast path.
  if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format<T>::max_exponent_fast_path() && pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) {
    value = T(pns.mantissa);
    if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); }
--- a/include/fast_float/simple_decimal_conversion.h
+++ b/include/fast_float/simple_decimal_conversion.h
@@ -360,6 +360,8 @@ adjusted_mantissa parse_long_mantissa(const char *first, const char* last) {
    // credit: R. Oudompheng who first implemented this fast path (to my knowledge).
    // It is rough, but it does the job of accelerating the slow path since most
    // long streams of digits are determined after 19 digits.
+    // Note that mantissa+1 cannot overflow since mantissa < 10**19 and so
+    // mantissa+1 <= 10**19 < 2**64.
    adjusted_mantissa am1 = compute_float<binary>(exponent, mantissa);
    adjusted_mantissa am2 = compute_float<binary>(exponent, mantissa+1);
    // They must both agree and be both a successful result.
--- a/script/table_generation.py
+++ b/script/table_generation.py
@@ -18,8 +18,8 @@ for q in range(-342,0):
        # truncate
        while(c >= (1<<128)):
          c //= 2
-        format(c)    
-    
+        format(c)
+
 for q in range(0,308+1):
    power5 = 5 ** q
    # move the most significant bit in position