Merge pull request #359 from shikharish/uint16

optimize uint16 parsing
2026-02-05 17:29:57 +08:00 · 2026-01-13 12:52:22 +01:00 · 2026-01-13 12:52:22 +01:00 · fd9cad9f0c
commit fd9cad9f0c
parent 42ae960d95 b14e6a466a
3 changed files with 227 additions and 1 deletions
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@ -11,7 +11,9 @@ FetchContent_MakeAvailable(counters)
 add_executable(realbenchmark benchmark.cpp)
 target_link_libraries(realbenchmark PRIVATE counters::counters)
 add_executable(bench_ip bench_ip.cpp)
+add_executable(bench_uint16 bench_uint16.cpp)
 target_link_libraries(bench_ip PRIVATE counters::counters)
+target_link_libraries(bench_uint16 PRIVATE counters::counters)

 set_property(
    TARGET realbenchmark
@ -19,8 +21,12 @@ set_property(
 set_property(
    TARGET bench_ip
    PROPERTY CXX_STANDARD 17)
+set_property(
+    TARGET bench_uint16
+    PROPERTY CXX_STANDARD 17)
 target_link_libraries(realbenchmark PUBLIC fast_float)
 target_link_libraries(bench_ip PUBLIC fast_float)
+target_link_libraries(bench_uint16 PUBLIC fast_float)

 include(ExternalProject)

--- a/benchmarks/bench_uint16.cpp
+++ b/benchmarks/bench_uint16.cpp
@ -0,0 +1,139 @@
+#include "counters/bench.h"
+#include "fast_float/fast_float.h"
+#include <charconv>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <random>
+#include <atomic>
+#include <string>
+#include <vector>
+
+// Prints one result line for a benchmark run.
+//
+// volume: number of integers parsed per run (used for the per-integer rates).
+// bytes:  size of the parsed input in bytes (used for GB/s and per-byte rates).
+// name:   label printed at the start of the line.
+// agg:    timing/counter aggregate returned by counters::bench.
+//
+// Cycle and instruction statistics are only printed when the event collector
+// reports that it has events. The per-item unit is "int" (one parsed integer);
+// the previous "ip" suffix mislabelled the values in this uint16 benchmark.
+void pretty_print(size_t volume, size_t bytes, const std::string &name,
+                  counters::event_aggregate agg) {
+  if (agg.inner_count > 1) {
+    printf("# (inner count: %d)\n", agg.inner_count);
+  }
+  printf("%-40s : ", name.c_str());
+  // bytes per nanosecond is numerically the same as GB/s.
+  printf(" %5.2f GB/s ", bytes / agg.fastest_elapsed_ns());
+  printf(" %5.1f Mint/s ", volume * 1000.0 / agg.fastest_elapsed_ns());
+  printf(" %5.2f ns/int ", agg.fastest_elapsed_ns() / volume);
+  if (counters::event_collector().has_events()) {
+    printf(" %5.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns());
+    printf(" %5.2f c/int ", agg.fastest_cycles() / volume);
+    printf(" %5.2f i/int ", agg.fastest_instructions() / volume);
+    printf(" %5.2f c/b ", agg.fastest_cycles() / bytes);
+    printf(" %5.2f i/b ", agg.fastest_instructions() / bytes);
+    printf(" %5.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles());
+  }
+  printf("\n");
+}
+
+enum class parse_method { standard, fast_float };
+
+// Re-parses `buffer` with fast_float::from_chars and checks that it yields
+// exactly the values in `expected`, separated by single `delimiter`
+// characters, with no bytes left over. On the first discrepancy a diagnostic
+// is printed and the process aborts; otherwise "Validation passed!" is
+// printed.
+void validate(const std::string &buffer, const std::vector<uint16_t> &expected,
+              char delimiter) {
+  const char *p = buffer.data();
+  const char *pend = p + buffer.size();
+
+  for (size_t i = 0; i < expected.size(); i++) {
+    // Initialized so the diagnostic below never reads an indeterminate value
+    // when the parser reports an error without storing a result.
+    uint16_t val = 0;
+    auto r = fast_float::from_chars(p, pend, val);
+    if (r.ec != std::errc() || val != expected[i]) {
+      // uint16_t promotes to int in a variadic call; cast so the arguments
+      // really match the %u conversions.
+      printf("Validation failed at index %zu: expected %u, got %u\n", i,
+             static_cast<unsigned>(expected[i]), static_cast<unsigned>(val));
+      std::abort();
+    }
+    p = r.ptr;
+    // Every value except the last must be followed by the delimiter.
+    if (i + 1 < expected.size()) {
+      if (p >= pend || *p != delimiter) {
+        printf("Validation failed at index %zu: delimiter mismatch\n", i);
+        std::abort();
+      }
+      ++p;
+    }
+  }
+
+  if (p != pend) {
+    printf("Validation failed: trailing bytes remain\n");
+    std::abort();
+  }
+  printf("Validation passed!\n");
+}
+
+// Benchmark driver: builds a comma-separated buffer of N pseudo-random
+// uint16 values, validates that fast_float parses it back exactly, then times
+// std::from_chars and fast_float::from_chars over the same buffer and prints
+// one result line for each.
+int main() {
+  constexpr size_t N = 500000;
+  constexpr char delimiter = ',';
+  // Fixed seed so every run parses the same input.
+  std::mt19937 rng(1234);
+  // Covers the whole uint16_t range, so values have 1 to 5 digits.
+  std::uniform_int_distribution<int> dist(0, 65535);
+
+  std::vector<uint16_t> expected;
+  expected.reserve(N);
+
+  std::string buffer;
+  buffer.reserve(N * 6); // up to 5 digits + delimiter
+
+  // Generate the values and their textual form; the delimiter is written
+  // between values only, never after the last one.
+  for (size_t i = 0; i < N; ++i) {
+    uint16_t val = (uint16_t)dist(rng);
+    expected.push_back(val);
+    std::string s = std::to_string(val);
+    buffer.append(s);
+    if (i + 1 < N) {
+      buffer.push_back(delimiter);
+    }
+  }
+
+  size_t total_bytes = buffer.size();
+
+  // Abort early if fast_float does not round-trip the generated input.
+  validate(buffer, expected, delimiter);
+
+  // Each timed run adds its sum here; the volatile store keeps the parsed
+  // values observable.
+  volatile uint64_t sink = 0;
+
+  // Baseline: std::from_chars over the whole buffer.
+  pretty_print(N, total_bytes, "parse_uint16_std_fromchars",
+               counters::bench([&]() {
+                 uint64_t sum = 0;
+                 const char *p = buffer.data();
+                 const char *pend = p + buffer.size();
+                 for (size_t i = 0; i < N; ++i) {
+                   uint16_t value = 0;
+                   auto r = std::from_chars(p, pend, value);
+                   if (r.ec != std::errc())
+                     std::abort();
+                   sum += value;
+                   p = r.ptr;
+                   // Skip the delimiter that follows every value but the last.
+                   if (i + 1 < N) {
+                     if (p >= pend || *p != delimiter)
+                       std::abort();
+                     ++p;
+                   }
+                 }
+                 // The whole buffer must have been consumed.
+                 if (p != pend)
+                   std::abort();
+                 sink += sum;
+               }));
+
+  // Same loop, parsing with fast_float::from_chars instead.
+  pretty_print(N, total_bytes, "parse_uint16_fastfloat", counters::bench([&]() {
+                 uint64_t sum = 0;
+                 const char *p = buffer.data();
+                 const char *pend = p + buffer.size();
+                 for (size_t i = 0; i < N; ++i) {
+                   uint16_t value = 0;
+                   auto r = fast_float::from_chars(p, pend, value);
+                   if (r.ec != std::errc())
+                     std::abort();
+                   sum += value;
+                   p = r.ptr;
+                   // Skip the delimiter that follows every value but the last.
+                   if (i + 1 < N) {
+                     if (p >= pend || *p != delimiter)
+                       std::abort();
+                     ++p;
+                   }
+                 }
+                 // The whole buffer must have been consumed.
+                 if (p != pend)
+                   std::abort();
+                 sink += sum;
+               }));
+
+  return EXIT_SUCCESS;
+}
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@ -32,7 +32,7 @@ template <typename UC> fastfloat_really_inline constexpr bool has_simd_opt() {
 // able to optimize it well.
 template <typename UC>
 fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
-  return !(c > UC('9') || c < UC('0'));
+  // Single-comparison range check: subtracting '0' maps '0'..'9' to 0..9,
+  // and the conversion to unsigned turns anything below '0' into a large
+  // value, so one `<= 9u` test covers both bounds.
+  return (unsigned)(c - UC('0')) <= 9u;
 }

 fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
@ -68,6 +68,25 @@ read8_to_u64(UC const *chars) {
  return val;
 }

+// Packs four consecutive code units into a uint32_t, with the first unit in
+// the least significant byte. For character types other than char only the
+// low 8 bits of each unit are kept.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
+read4_to_u32(UC const *chars) {
+  if (!cpp20_and_in_constexpr() && std::is_same<UC, char>::value) {
+    // Plain char at run time: copy the four bytes in one go and fix up the
+    // byte order on big-endian targets.
+    uint32_t packed;
+    ::memcpy(&packed, chars, sizeof(uint32_t));
+#if FASTFLOAT_IS_BIG_ENDIAN == 1
+    packed = byteswap_32(packed);
+#endif
+    return packed;
+  }
+  // Constant evaluation or a non-char type: assemble the word unit by unit.
+  uint32_t packed = 0;
+  for (int shift = 0; shift < 32; shift += 8) {
+    packed |= uint32_t(uint8_t(*chars)) << shift;
+    ++chars;
+  }
+  return packed;
+}
 #ifdef FASTFLOAT_SSE2

 fastfloat_really_inline uint64_t simd_read8_to_u64(__m128i const data) {
@ -149,6 +168,18 @@ is_made_of_eight_digits_fast(uint64_t val) noexcept {
            0x8080808080808080));
 }

+// Returns true when each of the four bytes packed in `val` is an ASCII digit.
+// For a digit byte b neither b + 0x46 nor b - 0x30 has its top bit set; for
+// any other byte at least one of them does, so a single mask of the top bits
+// detects every non-digit.
+fastfloat_really_inline constexpr bool
+is_made_of_four_digits_fast(uint32_t val) noexcept {
+  return (((val + 0x46464646) | (val - 0x30303030)) & 0x80808080) == 0;
+}
+
+// Converts four ASCII digits packed in `val` (most significant decimal digit
+// in the lowest byte) into their numeric value, 0..9999. The caller must have
+// checked that all four bytes are digits.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
+parse_four_digits_unrolled(uint32_t val) noexcept {
+  // Turn every ASCII byte into its digit value 0..9.
+  uint32_t const digits = val - 0x30303030;
+  // Bytes 0 and 2 now hold the two-digit numbers d0*10+d1 and d2*10+d3.
+  uint32_t const pairs = (digits * 10) + (digits >> 8);
+  // Multiplying by (100 << 16) + 1 leaves pair0*100 + pair1 in bits 16..31.
+  uint32_t const combined = (pairs & 0x00FF00FF) * 0x00640001;
+  return (combined >> 16) & 0xFFFF;
+}
+
 #ifdef FASTFLOAT_HAS_SIMD

 // Call this if chars might not be 8 digits.
@ -606,6 +637,56 @@ parse_int_string(UC const *p, UC const *pend, T &value,
    }
  }

+  // Decimal fast path for uint16_t. It handles an empty remainder and inputs
+  // that start with at least four digits; every other case falls through to
+  // the generic parsing code below.
+  FASTFLOAT_IF_CONSTEXPR17((std::is_same<T, std::uint16_t>::value)) {
+    if (base == 10) {
+      const size_t len = size_t(pend - p);
+      if (len == 0) {
+        // Nothing left to read. NOTE(review): has_leading_zeros is presumably
+        // set when zeros were consumed earlier, making this a valid 0;
+        // otherwise no number was present at all.
+        if (has_leading_zeros) {
+          value = 0;
+          answer.ec = std::errc();
+          answer.ptr = p;
+        } else {
+          answer.ec = std::errc::invalid_argument;
+          answer.ptr = first;
+        }
+        return answer;
+      }
+
+      // read4_to_u32 keeps only the low 8 bits of each code unit, so with a
+      // wider character type a non-digit such as U+0131 would be mistaken
+      // for '1' and accepted. Take the packed path only for 1-byte code
+      // units; wider inputs fall through to the generic code below.
+      FASTFLOAT_IF_CONSTEXPR17((sizeof(UC) == 1)) {
+        if (len >= 4) {
+          uint32_t digits = read4_to_u32(p);
+          if (is_made_of_four_digits_fast(digits)) {
+            uint32_t v = parse_four_digits_unrolled(digits);
+            if (len >= 5 && is_integer(p[4])) {
+              v = v * 10 + uint32_t(p[4] - '0');
+              if (len >= 6 && is_integer(p[5])) {
+                // Six or more digits cannot fit in a uint16_t (assumes
+                // leading zeros were already skipped - TODO confirm).
+                // Consume the rest of the digit run so ptr points past it.
+                answer.ec = std::errc::result_out_of_range;
+                const UC *q = p + 5;
+                while (q != pend && is_integer(*q)) {
+                  q++;
+                }
+                answer.ptr = q;
+                return answer;
+              }
+              // Exactly five digits: may still exceed 65535.
+              if (v > 65535) {
+                answer.ec = std::errc::result_out_of_range;
+                answer.ptr = p + 5;
+                return answer;
+              }
+              value = uint16_t(v);
+              answer.ec = std::errc();
+              answer.ptr = p + 5;
+              return answer;
+            }
+            // 4 digits
+            value = uint16_t(v);
+            answer.ec = std::errc();
+            answer.ptr = p + 4;
+            return answer;
+          }
+        }
+      }
+    }
+  }
+
  uint64_t i = 0;
  if (base == 10) {
    loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible