adding bench_ip

Co-authored-by: Shikhar <shikharish05@gmail.com>
2026-08-01 17:26:27 +08:00 · 2025-12-22 11:52:48 -05:00 · 2025-12-22 11:52:48 -05:00 · 0fa058eebb
commit 0fa058eebb
parent 157deaeba5
6 changed files with 204 additions and 1405 deletions
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@ -1,9 +1,27 @@
+include(FetchContent)
+
+# Fetch the counters project at a pinned tag. The benchmark sources include
+# "counters/event_counter.h" and the targets below link Counters::counters.
+FetchContent_Declare(
+  counters
+  GIT_REPOSITORY https://github.com/lemire/counters.git
+  GIT_TAG v1.0.1
+)
+
+FetchContent_MakeAvailable(counters)
+
 add_executable(realbenchmark benchmark.cpp)
+target_link_libraries(realbenchmark PRIVATE Counters::counters)
+# bench_ip is a second benchmark executable built from bench_ip.cpp.
+add_executable(bench_ip bench_ip.cpp)
+target_link_libraries(bench_ip PRIVATE Counters::counters)
+
 set_property(
     TARGET realbenchmark
     PROPERTY CXX_STANDARD 17)
-
+# bench_ip.cpp includes <charconv>, so it is built as C++17 as well.
+set_property(
+    TARGET bench_ip
+    PROPERTY CXX_STANDARD 17)
 target_link_libraries(realbenchmark PUBLIC fast_float)
+target_link_libraries(bench_ip PUBLIC fast_float)
+
 include(ExternalProject)

 # Define the external project
--- a/benchmarks/apple_arm_events.h
+++ b/benchmarks/apple_arm_events.h
--- a/benchmarks/bench_ip.cpp
+++ b/benchmarks/bench_ip.cpp
@ -0,0 +1,183 @@
+#include "counters/event_counter.h"
+#include "fast_float/fast_float.h"
+#include <atomic>
+#include <charconv>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <random>
+#include <string>
+event_collector collector;
+
+/// Runs `function` repeatedly and aggregates the sample that the global
+/// `collector` records around each run.
+///
+/// The function is executed `min_repeat` times (at least once). Each time the
+/// planned repetitions are used up while the accumulated elapsed time is still
+/// below `min_time_ns`, the repetition target grows tenfold, but it is never
+/// raised above `max_repeat`.
+///
+/// @param function     callable invoked with no arguments
+/// @param min_repeat   initial number of repetitions (0 is treated as 1)
+/// @param min_time_ns  accumulated elapsed time, in nanoseconds, to aim for
+/// @param max_repeat   upper bound applied whenever the target is grown
+/// @return the aggregate of every recorded sample
+template <class function_type>
+event_aggregate bench(const function_type &function, size_t min_repeat = 10,
+                      size_t min_time_ns = 1000000000,
+                      size_t max_repeat = 1000000) {
+  event_aggregate aggregate{};
+  size_t target = (min_repeat == 0) ? 1 : min_repeat;
+  for (size_t i = 0; i < target; i++) {
+    // NOTE(review): the fences are presumably meant to keep memory accesses
+    // from being reordered across the measured region; confirm.
+    std::atomic_thread_fence(std::memory_order_acquire);
+    collector.start();
+    function();
+    std::atomic_thread_fence(std::memory_order_release);
+    event_count sample = collector.end();
+    aggregate << sample;
+    const bool last_planned_run = (i + 1 == target);
+    if (last_planned_run && (aggregate.total_elapsed_ns() < min_time_ns) &&
+        (target < max_repeat)) {
+      // Grow tenfold, clamped to max_repeat. Comparing against
+      // max_repeat / 10 also avoids overflowing target * 10.
+      target = (target > max_repeat / 10) ? max_repeat : target * 10;
+    }
+  }
+  return aggregate;
+}
+
+/// Prints one result line for `name`, based on the fastest run in `agg`.
+///
+/// Always printed: GB/s (bytes per fastest nanosecond), Ma/s (volume * 1000
+/// per fastest nanosecond) and ns/d (fastest nanoseconds per unit of volume).
+/// When the global `collector` reports events, the cycle and instruction
+/// ratios (GHz, c/d, i/d, c/b, i/b, i/c) are appended to the same line.
+///
+/// @param volume  item count used as divisor for the per-item figures
+/// @param bytes   byte count used as divisor for the per-byte figures
+/// @param name    label printed in the first column
+/// @param agg     aggregated measurements to report
+void pretty_print(size_t volume, size_t bytes, std::string name,
+                  event_aggregate agg) {
+  const auto best_ns = agg.fastest_elapsed_ns();
+  printf("%-40s : ", name.c_str());
+  printf(" %5.2f GB/s ", bytes / best_ns);
+  printf(" %5.1f Ma/s ", volume * 1000.0 / best_ns);
+  printf(" %5.2f ns/d ", best_ns / volume);
+  if (collector.has_events()) {
+    const auto best_cycles = agg.fastest_cycles();
+    const auto best_instructions = agg.fastest_instructions();
+    printf(" %5.2f GHz ", best_cycles / best_ns);
+    printf(" %5.2f c/d ", best_cycles / volume);
+    printf(" %5.2f i/d ", best_instructions / volume);
+    printf(" %5.2f c/b ", best_cycles / bytes);
+    printf(" %5.2f i/b ", best_instructions / bytes);
+    printf(" %5.2f i/c ", best_instructions / best_cycles);
+  }
+  printf("\n");
+}
+
+/// Parses one unsigned 8-bit integer from [p, pend) with
+/// fast_float::from_chars.
+///
+/// Declared `static inline`, like parse_u8_fromchars, so that the two parsers
+/// being compared have the same linkage.
+///
+/// @param p     cursor into the input; moved to the position reported by
+///              from_chars on success, left untouched on failure
+/// @param pend  one past the last readable character
+/// @param out   destination handed to from_chars
+/// @return 1 on success, 0 when the range is empty or from_chars reports an
+///         error
+static inline int parse_u8_fastswar(const char *&p, const char *pend,
+                                    uint8_t *out) {
+  if (p == pend)
+    return 0;
+  auto r = fast_float::from_chars(p, pend, *out);
+  if (r.ec == std::errc()) {
+    p = r.ptr;
+    return 1;
+  }
+  return 0;
+}
+
+/// Parses one unsigned 8-bit integer from [p, pend) with std::from_chars.
+///
+/// @param p     cursor into the input; moved to the position reported by
+///              from_chars on success, left untouched on failure
+/// @param pend  one past the last readable character
+/// @param out   destination handed to from_chars
+/// @return 1 on success, 0 when the range is empty or from_chars reports an
+///         error
+static inline int parse_u8_fromchars(const char *&p, const char *pend,
+                                     uint8_t *out) {
+  if (p != pend) {
+    const auto result = std::from_chars(p, pend, *out);
+    if (result.ec == std::errc{}) {
+      p = result.ptr;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/// Parses one dotted-quad line ("a.b.c.d", optionally followed by '\r', then
+/// '\n' or the end of the input) starting at `p`.
+///
+/// @param p            cursor into the input; advanced as characters are
+///                     consumed (also on a failure part-way through a line)
+/// @param pend         one past the last readable character
+/// @param sum          every octet parsed so far is added to it, including
+///                     the octets of a line that later turns out malformed
+/// @param parse_uint8  callable (p, pend, uint8_t*) returning non-zero on
+///                     success and advancing p past what it consumed
+/// @return 1 when a complete, properly terminated line was consumed, 0
+///         otherwise
+template <typename Parser>
+static inline int parse_ip_line(const char *&p, const char *pend, uint32_t &sum,
+                                Parser parse_uint8) {
+  uint8_t octet = 0;
+  for (int i = 0; i < 4; ++i) {
+    if (!parse_uint8(p, pend, &octet))
+      return 0;
+    sum += octet;
+    if (i != 3) {
+      // the first three octets must each be followed by a '.'
+      if (p == pend || *p != '.')
+        return 0;
+      ++p;
+    }
+  }
+  // consume optional '\r'
+  if (p != pend && *p == '\r')
+    ++p;
+  // The line has to stop here: either the input is exhausted or a '\n'
+  // follows. Anything else (e.g. "1.2.3.4x") is rejected.
+  if (p == pend)
+    return 1;
+  if (*p != '\n')
+    return 0;
+  ++p;
+  return 1;
+}
+
+/// Formats four octets as the text line "a.b.c.d\n".
+///
+/// @param a,b,c,d  the octets, written in decimal in this order
+/// @return the formatted line, terminated by a single '\n'
+static std::string make_ip_line(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
+  const uint8_t octets[4] = {a, b, c, d};
+  std::string line;
+  line.reserve(16); // "255.255.255.255\n" is 16 characters
+  for (int idx = 0; idx < 4; ++idx) {
+    line += std::to_string(octets[idx]);
+    line += (idx == 3) ? '\n' : '.';
+  }
+  return line;
+}
+
+/// Benchmarks dotted-quad parsing with std::from_chars against
+/// fast_float::from_chars.
+///
+/// Builds a buffer of N "a.b.c.d\n" lines from a fixed-seed generator,
+/// cross-checks the two parsers line by line over the whole buffer, then
+/// times each parser over the buffer and prints one result line per parser.
+int main() {
+  constexpr size_t N = 500000;
+  std::mt19937 rng(1234);
+  std::uniform_int_distribution<int> dist(0, 255);
+
+  std::string buf;
+  buf.reserve(N * 16);
+
+  for (size_t i = 0; i < N; ++i) {
+    const auto a = static_cast<uint8_t>(dist(rng));
+    const auto b = static_cast<uint8_t>(dist(rng));
+    const auto c = static_cast<uint8_t>(dist(rng));
+    const auto d = static_cast<uint8_t>(dist(rng));
+    buf += make_ip_line(a, b, c, d);
+  }
+
+  // sentinel to allow 4-byte loads at end
+  buf.append(4, '\0');
+
+  const size_t bytes = buf.size() - 4; // exclude sentinel from throughput
+  const size_t volume = N;
+
+  // validate correctness: walk the whole buffer with one cursor per parser and
+  // require both to consume the same characters and produce the same octet sum
+  // on every line
+  {
+    const char *const end = buf.data() + bytes;
+    const char *p_fromchars = buf.data();
+    const char *p_fastswar = buf.data();
+    for (size_t i = 0; i < N; ++i) {
+      uint32_t sum_fromchars = 0;
+      uint32_t sum_fastswar = 0;
+      if (!parse_ip_line(p_fromchars, end, sum_fromchars,
+                         parse_u8_fromchars)) {
+        std::fprintf(stderr, "fromchars parse failed at line %zu\n", i);
+        std::abort();
+      }
+      if (!parse_ip_line(p_fastswar, end, sum_fastswar, parse_u8_fastswar)) {
+        std::fprintf(stderr, "fastswar parse failed at line %zu\n", i);
+        std::abort();
+      }
+      if (p_fromchars != p_fastswar || sum_fromchars != sum_fastswar) {
+        std::fprintf(stderr, "parser mismatch at line %zu\n", i);
+        std::abort();
+      }
+    }
+  }
+
+  // accumulated across all benchmark runs and printed at the end, so the
+  // parsed values are actually used
+  uint32_t sink = 0;
+
+  pretty_print(volume, bytes, "parse_ip_fromchars", bench([&]() {
+                 const char *p = buf.data();
+                 const char *pend = buf.data() + bytes;
+                 uint32_t sum = 0;
+                 for (size_t i = 0; i < N; ++i) {
+                   if (!parse_ip_line(p, pend, sum, parse_u8_fromchars))
+                     std::abort();
+                 }
+                 sink += sum;
+               }));
+
+  pretty_print(volume, bytes, "parse_ip_fastswar", bench([&]() {
+                 const char *p = buf.data();
+                 const char *pend = buf.data() + bytes;
+                 uint32_t sum = 0;
+                 for (size_t i = 0; i < N; ++i) {
+                   if (!parse_ip_line(p, pend, sum, parse_u8_fastswar))
+                     std::abort();
+                 }
+                 sink += sum;
+               }));
+
+  std::printf("sink=%u\n", sink);
+  return EXIT_SUCCESS;
+}
--- a/benchmarks/benchmark.cpp
+++ b/benchmarks/benchmark.cpp
@ -1,7 +1,7 @@
 #if defined(__linux__) || (__APPLE__ && __aarch64__)
 #define USING_COUNTERS
 #endif
-#include "event_counter.h"
+#include "counters/event_counter.h"
 #include <algorithm>
 #include "fast_float/fast_float.h"
 #include <chrono>
@ -102,7 +102,7 @@ void pretty_print(double volume, size_t number_of_floats, std::string name,
    branches_avg += branches;
    branches_min = branches_min < branches ? branches_min : branches;

-    double branch_misses = e.missed_branches();
+    double branch_misses = e.branch_misses();
    branch_misses_avg += branch_misses;
    branch_misses_min =
        branch_misses_min < branch_misses ? branch_misses_min : branch_misses;
--- a/benchmarks/event_counter.h
+++ b/benchmarks/event_counter.h
@ -1,181 +0,0 @@
-#ifndef __EVENT_COUNTER_H
-#define __EVENT_COUNTER_H
-
-#include <cctype>
-#ifndef _MSC_VER
-#include <dirent.h>
-#endif
-#include <cinttypes>
-
-#include <cstring>
-
-#include <chrono>
-#include <vector>
-
-#include "linux-perf-events.h"
-#ifdef __linux__
-#include <libgen.h>
-#endif
-
-#if (defined(__APPLE__) && __APPLE__) && (defined(__aarch64__) && __aarch64__)
-#include "apple_arm_events.h"
-#endif
-
-struct event_count {
-  std::chrono::duration<double> elapsed;
-  std::vector<unsigned long long> event_counts;
-
-  event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {}
-
-  event_count(const std::chrono::duration<double> _elapsed,
-              const std::vector<unsigned long long> _event_counts)
-      : elapsed(_elapsed), event_counts(_event_counts) {}
-
-  event_count(const event_count &other)
-      : elapsed(other.elapsed), event_counts(other.event_counts) {}
-
-  // The types of counters (so we can read the getter more easily)
-  enum event_counter_types {
-    CPU_CYCLES = 0,
-    INSTRUCTIONS = 1,
-    BRANCHES = 2,
-    MISSED_BRANCHES = 3
-  };
-
-  double elapsed_sec() const {
-    return std::chrono::duration<double>(elapsed).count();
-  }
-
-  double elapsed_ns() const {
-    return std::chrono::duration<double, std::nano>(elapsed).count();
-  }
-
-  double cycles() const {
-    return static_cast<double>(event_counts[CPU_CYCLES]);
-  }
-
-  double instructions() const {
-    return static_cast<double>(event_counts[INSTRUCTIONS]);
-  }
-
-  double branches() const {
-    return static_cast<double>(event_counts[BRANCHES]);
-  }
-
-  double missed_branches() const {
-    return static_cast<double>(event_counts[MISSED_BRANCHES]);
-  }
-
-  event_count &operator=(const event_count &other) {
-    this->elapsed = other.elapsed;
-    this->event_counts = other.event_counts;
-    return *this;
-  }
-
-  event_count operator+(const event_count &other) const {
-    return event_count(elapsed + other.elapsed,
-                       {
-                           event_counts[0] + other.event_counts[0],
-                           event_counts[1] + other.event_counts[1],
-                           event_counts[2] + other.event_counts[2],
-                           event_counts[3] + other.event_counts[3],
-                           event_counts[4] + other.event_counts[4],
-                       });
-  }
-
-  void operator+=(const event_count &other) { *this = *this + other; }
-};
-
-struct event_aggregate {
-  bool has_events = false;
-  int iterations = 0;
-  event_count total{};
-  event_count best{};
-  event_count worst{};
-
-  event_aggregate() = default;
-
-  void operator<<(const event_count &other) {
-    if (iterations == 0 || other.elapsed < best.elapsed) {
-      best = other;
-    }
-    if (iterations == 0 || other.elapsed > worst.elapsed) {
-      worst = other;
-    }
-    iterations++;
-    total += other;
-  }
-
-  double elapsed_sec() const { return total.elapsed_sec() / iterations; }
-
-  double elapsed_ns() const { return total.elapsed_ns() / iterations; }
-
-  double cycles() const { return total.cycles() / iterations; }
-
-  double instructions() const { return total.instructions() / iterations; }
-
-  double branches() const { return total.branches() / iterations; }
-
-  double missed_branches() const {
-    return total.missed_branches() / iterations;
-  }
-};
-
-struct event_collector {
-  event_count count{};
-  std::chrono::time_point<std::chrono::steady_clock> start_clock{};
-
-#if defined(__linux__)
-  LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
-
-  event_collector()
-      : linux_events(std::vector<int>{
-            PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS,
-            PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions
-            PERF_COUNT_HW_BRANCH_MISSES}) {}
-
-  bool has_events() { return linux_events.is_working(); }
-#elif __APPLE__ && __aarch64__
-  performance_counters diff;
-
-  event_collector() : diff(0) { setup_performance_counters(); }
-
-  bool has_events() { return setup_performance_counters(); }
-#else
-  event_collector() {}
-
-  bool has_events() { return false; }
-#endif
-
-  inline void start() {
-#if defined(__linux)
-    linux_events.start();
-#elif __APPLE__ && __aarch64__
-    if (has_events()) {
-      diff = get_counters();
-    }
-#endif
-    start_clock = std::chrono::steady_clock::now();
-  }
-
-  inline event_count &end() {
-    const auto end_clock = std::chrono::steady_clock::now();
-#if defined(__linux)
-    linux_events.end(count.event_counts);
-#elif __APPLE__ && __aarch64__
-    if (has_events()) {
-      performance_counters end = get_counters();
-      diff = end - diff;
-    }
-    count.event_counts[0] = diff.cycles;
-    count.event_counts[1] = diff.instructions;
-    count.event_counts[2] = diff.branches;
-    count.event_counts[3] = diff.missed_branches;
-    count.event_counts[4] = 0;
-#endif
-    count.elapsed = end_clock - start_clock;
-    return count;
-  }
-};
-
-#endif
--- a/benchmarks/linux-perf-events.h
+++ b/benchmarks/linux-perf-events.h
@ -1,104 +0,0 @@
-#pragma once
-#ifdef __linux__
-
-#include <asm/unistd.h>       // for __NR_perf_event_open
-#include <linux/perf_event.h> // for perf event constants
-#include <sys/ioctl.h>        // for ioctl
-#include <unistd.h>           // for syscall
-
-#include <cerrno>  // for errno
-#include <cstring> // for memset
-#include <stdexcept>
-
-#include <iostream>
-#include <vector>
-
-template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
-  int fd;
-  bool working;
-  perf_event_attr attribs{};
-  size_t num_events{};
-  std::vector<uint64_t> temp_result_vec{};
-  std::vector<uint64_t> ids{};
-
-public:
-  explicit LinuxEvents(std::vector<int> config_vec) : fd(0), working(true) {
-    memset(&attribs, 0, sizeof(attribs));
-    attribs.type = TYPE;
-    attribs.size = sizeof(attribs);
-    attribs.disabled = 1;
-    attribs.exclude_kernel = 1;
-    attribs.exclude_hv = 1;
-
-    attribs.sample_period = 0;
-    attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
-    const int pid = 0;  // the current process
-    const int cpu = -1; // all CPUs
-    const unsigned long flags = 0;
-
-    int group = -1; // no group
-    num_events = config_vec.size();
-    ids.resize(config_vec.size());
-    uint32_t i = 0;
-    for (auto config : config_vec) {
-      attribs.config = config;
-      int _fd = static_cast<int>(
-          syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
-      if (_fd == -1) {
-        report_error("perf_event_open");
-      }
-      ioctl(_fd, PERF_EVENT_IOC_ID, &ids[i++]);
-      if (group == -1) {
-        group = _fd;
-        fd = _fd;
-      }
-    }
-
-    temp_result_vec.resize(num_events * 2 + 1);
-  }
-
-  ~LinuxEvents() {
-    if (fd != -1) {
-      close(fd);
-    }
-  }
-
-  inline void start() {
-    if (fd != -1) {
-      if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
-        report_error("ioctl(PERF_EVENT_IOC_RESET)");
-      }
-
-      if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
-        report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
-      }
-    }
-  }
-
-  inline void end(std::vector<unsigned long long> &results) {
-    if (fd != -1) {
-      if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
-        report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
-      }
-
-      if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) {
-        report_error("read");
-      }
-    }
-    // our actual results are in slots 1,3,5, ... of this structure
-    for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
-      results[i / 2] = temp_result_vec[i];
-    }
-    for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) {
-      if (ids[i / 2 - 1] != temp_result_vec[i]) {
-        report_error("event mismatch");
-      }
-    }
-  }
-
-  bool is_working() { return working; }
-
-private:
-  void report_error(const std::string &) { working = false; }
-};
-#endif