adding bench_ip

Co-authored-by: Shikhar <shikharish05@gmail.com>
This commit is contained in:
Daniel Lemire 2025-12-22 11:52:48 -05:00
parent 157deaeba5
commit 0fa058eebb
6 changed files with 204 additions and 1405 deletions

View File

@ -1,9 +1,27 @@
# Fetch the `counters` dependency from GitHub at configure time, pinned to
# tag v1.0.1, and make its targets available to this directory.
include(FetchContent)
FetchContent_Declare(
counters
GIT_REPOSITORY https://github.com/lemire/counters.git
GIT_TAG v1.0.1
)
FetchContent_MakeAvailable(counters)
# Two benchmark executables, each built from a single source file and each
# linked against the Counters::counters target.
add_executable(realbenchmark benchmark.cpp)
target_link_libraries(realbenchmark PRIVATE Counters::counters)
add_executable(bench_ip bench_ip.cpp)
target_link_libraries(bench_ip PRIVATE Counters::counters)
# Both executables are compiled as C++17.
set_property(
TARGET realbenchmark
PROPERTY CXX_STANDARD 17)
set_property(
TARGET bench_ip
PROPERTY CXX_STANDARD 17)
# Both executables also link the fast_float target (the library under test).
target_link_libraries(realbenchmark PUBLIC fast_float)
target_link_libraries(bench_ip PUBLIC fast_float)
include(ExternalProject)
# Define the external project

File diff suppressed because it is too large Load Diff

183
benchmarks/bench_ip.cpp Normal file
View File

@ -0,0 +1,183 @@
#include "counters/event_counter.h"
#include "fast_float/fast_float.h"
#include <charconv>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <random>
#include <atomic>
// Single file-wide event_collector instance: bench() brackets every timed run
// with collector.start()/collector.end(), and pretty_print() queries
// collector.has_events() to decide whether to print the counter-based columns.
event_collector collector;
/**
 * Runs `function` repeatedly and aggregates one measurement per run.
 *
 * The loop starts with `min_repeat` scheduled runs (at least one). Each time
 * the last scheduled run finishes while the accumulated elapsed time is still
 * below `min_time_ns`, the schedule is grown tenfold. Growth is clamped to
 * `max_repeat`, so the schedule never overshoots the requested ceiling (and
 * the multiplication cannot overflow). If `min_repeat` itself is larger than
 * `max_repeat`, exactly `min_repeat` runs are executed.
 *
 * @param function     callable invoked with no arguments once per run
 * @param min_repeat   initial number of runs (0 is treated as 1)
 * @param min_time_ns  accumulated elapsed time, in nanoseconds, below which
 *                     the schedule keeps growing
 * @param max_repeat   upper bound applied whenever the schedule grows
 * @return the aggregate of all recorded runs
 */
template <class function_type>
event_aggregate bench(const function_type& function, size_t min_repeat = 10, size_t min_time_ns = 1000000000, size_t max_repeat = 1000000) {
  event_aggregate aggregate{};
  size_t N = min_repeat;
  if (N == 0) {
    N = 1;
  }
  for (size_t i = 0; i < N; i++) {
    // NOTE(review): the fences presumably keep memory operations from being
    // reordered across the boundaries of the timed region.
    std::atomic_thread_fence(std::memory_order_acquire);
    collector.start();
    function();
    std::atomic_thread_fence(std::memory_order_release);
    event_count run_count = collector.end();
    aggregate << run_count;
    const bool last_scheduled_run = (i + 1 == N);
    if (last_scheduled_run && (aggregate.total_elapsed_ns() < min_time_ns) &&
        (N < max_repeat)) {
      // Grow tenfold but never past max_repeat; comparing against
      // max_repeat / 10 first also avoids overflowing N * 10.
      N = (N > max_repeat / 10) ? max_repeat : N * 10;
    }
  }
  return aggregate;
}
/**
 * Prints one result row for a benchmark, based on the fastest recorded run.
 *
 * Always printed: throughput in GB/s (bytes per nanosecond), millions of
 * items per second, and nanoseconds per item. When the global collector
 * reports that events are available, the row also contains the effective
 * frequency, cycles and instructions per item, cycles and instructions per
 * byte, and instructions per cycle.
 *
 * @param volume number of items processed by one run
 * @param bytes  number of input bytes processed by one run
 * @param name   label printed in the first (left-aligned, 40 wide) column
 * @param agg    aggregated measurements of the benchmark
 */
void pretty_print(size_t volume, size_t bytes, std::string name,
                  event_aggregate agg) {
  // Every column is derived from the fastest run; fetch its duration once.
  const auto best_ns = agg.fastest_elapsed_ns();

  printf("%-40s : ", name.c_str());
  printf(" %5.2f GB/s ", bytes / best_ns);
  printf(" %5.1f Ma/s ", volume * 1000.0 / best_ns);
  printf(" %5.2f ns/d ", best_ns / volume);

  if (collector.has_events()) {
    const auto best_cycles = agg.fastest_cycles();
    const auto best_instructions = agg.fastest_instructions();
    printf(" %5.2f GHz ", best_cycles / best_ns);
    printf(" %5.2f c/d ", best_cycles / volume);
    printf(" %5.2f i/d ", best_instructions / volume);
    printf(" %5.2f c/b ", best_cycles / bytes);
    printf(" %5.2f i/b ", best_instructions / bytes);
    printf(" %5.2f i/c ", best_instructions / best_cycles);
  }
  printf("\n");
}
/**
 * Parses one unsigned 8-bit decimal value at `p` using
 * fast_float::from_chars.
 *
 * @param p    cursor into the input; advanced past the parsed value on
 *             success, left untouched on failure
 * @param pend one past the last readable character
 * @param out  receives the parsed value on success
 * @return 1 on success, 0 when the range is empty or the parser reports an
 *         error
 */
int parse_u8_fastswar(const char *&p, const char *pend, uint8_t *out) {
  if (p != pend) {
    const auto parsed = fast_float::from_chars(p, pend, *out);
    if (parsed.ec == std::errc()) {
      p = parsed.ptr;
      return 1;
    }
  }
  return 0;
}
/**
 * Parses one unsigned 8-bit decimal value at `p` using std::from_chars.
 *
 * @param p    cursor into the input; advanced past the parsed value on
 *             success, left untouched on failure
 * @param pend one past the last readable character
 * @param out  receives the parsed value on success
 * @return 1 on success, 0 when the range is empty or std::from_chars reports
 *         an error (no digits, or a value that does not fit in uint8_t)
 */
static inline int parse_u8_fromchars(const char *&p, const char *pend,
                                     uint8_t *out) {
  if (p == pend) {
    return 0;
  }
  const std::from_chars_result parsed = std::from_chars(p, pend, *out);
  const bool succeeded = (parsed.ec == std::errc());
  if (succeeded) {
    p = parsed.ptr;
  }
  return succeeded ? 1 : 0;
}
/**
 * Parses one dotted-quad line ("a.b.c.d", optionally followed by "\r", then
 * terminated by "\n" or by the end of the input) and adds the four octets to
 * `sum`.
 *
 * @param p           cursor into the input; on success it points just past
 *                    the line terminator. On failure it may already have been
 *                    advanced past the part that parsed correctly.
 * @param pend        one past the last readable character
 * @param sum         accumulator; every octet parsed is added to it, including
 *                    octets parsed before a later failure
 * @param parse_uint8 callable `int(const char *&, const char *, uint8_t *)`
 *                    returning non-zero on success and advancing the cursor
 * @return 1 when a complete, properly terminated line was parsed, 0 otherwise
 */
template <typename Parser>
static inline int parse_ip_line(const char *&p, const char *pend, uint32_t &sum,
                                Parser parse_uint8) {
  uint8_t octet = 0;
  for (int i = 0; i < 4; ++i) {
    if (!parse_uint8(p, pend, &octet))
      return 0;
    sum += octet;
    if (i != 3) {
      // the first three octets must each be followed by a '.'
      if (p == pend || *p != '.')
        return 0;
      ++p;
    }
  }
  // consume optional '\r'
  if (p != pend && *p == '\r')
    ++p;
  // the line must end here: either the input is exhausted or a '\n' follows;
  // any other trailing byte makes the line malformed
  if (p == pend)
    return 1;
  if (*p != '\n')
    return 0;
  ++p;
  return 1;
}
/**
 * Formats four octets as a dotted-quad text line terminated by '\n',
 * e.g. (1, 2, 3, 4) -> "1.2.3.4\n".
 */
static std::string make_ip_line(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  const uint8_t octets[4] = {a, b, c, d};
  std::string line;
  line.reserve(16); // longest possible line: "255.255.255.255\n"
  for (int k = 0; k < 4; ++k) {
    line += std::to_string(octets[k]);
    // '.' between octets, newline after the last one
    line += (k == 3) ? '\n' : '.';
  }
  return line;
}
/**
 * Benchmark driver: builds a buffer of N random dotted-quad lines, checks
 * that the std::from_chars-based and fast_float-based parsers agree on every
 * line, then times both parsers over the whole buffer and prints one result
 * row per parser.
 */
int main() {
  constexpr size_t N = 500000;
  std::mt19937 rng(1234); // fixed seed: the same input on every execution
  std::uniform_int_distribution<int> dist(0, 255);
  std::string buf;
  buf.reserve(N * 16);
  for (size_t i = 0; i < N; ++i) {
    const uint8_t a = static_cast<uint8_t>(dist(rng));
    const uint8_t b = static_cast<uint8_t>(dist(rng));
    const uint8_t c = static_cast<uint8_t>(dist(rng));
    const uint8_t d = static_cast<uint8_t>(dist(rng));
    buf += make_ip_line(a, b, c, d);
  }
  // sentinel to allow 4-byte loads at end
  buf.append(4, '\0');
  const size_t bytes = buf.size() - 4; // exclude sentinel from throughput
  const size_t volume = N;

  // validate correctness: walk the buffer with one independent cursor per
  // parser and require that both accept every line, produce the same octet
  // sum for it, and stop at the same position.
  {
    const char *const end = buf.data() + bytes;
    const char *p_ref = buf.data();
    const char *p_fast = buf.data();
    for (size_t i = 0; i < N; ++i) {
      uint32_t sum_ref = 0;
      uint32_t sum_fast = 0;
      if (!parse_ip_line(p_ref, end, sum_ref, parse_u8_fromchars)) {
        std::fprintf(stderr, "fromchars parse failed at line %zu\n", i);
        std::abort();
      }
      if (!parse_ip_line(p_fast, end, sum_fast, parse_u8_fastswar)) {
        std::fprintf(stderr, "fastswar parse failed at line %zu\n", i);
        std::abort();
      }
      if (sum_ref != sum_fast || p_ref != p_fast) {
        std::fprintf(stderr, "parsers disagree at line %zu\n", i);
        std::abort();
      }
    }
    if (p_ref != end) {
      std::fprintf(stderr, "validation did not consume the whole buffer\n");
      std::abort();
    }
  }

  // Accumulates the sums computed inside the timed lambdas; it is printed at
  // the end so the parsing work has an observable result.
  uint32_t sink = 0;
  pretty_print(volume, bytes, "parse_ip_fromchars", bench([&]() {
    const char *p = buf.data();
    const char *pend = buf.data() + bytes;
    uint32_t sum = 0;
    int ok = 0;
    for (size_t i = 0; i < N; ++i) {
      ok = parse_ip_line(p, pend, sum, parse_u8_fromchars);
      if (!ok)
        std::abort();
    }
    sink += sum;
  }));
  pretty_print(volume, bytes, "parse_ip_fastswar", bench([&]() {
    const char *p = buf.data();
    const char *pend = buf.data() + bytes;
    uint32_t sum = 0;
    int ok = 0;
    for (size_t i = 0; i < N; ++i) {
      ok = parse_ip_line(p, pend, sum, parse_u8_fastswar);
      if (!ok)
        std::abort();
    }
    sink += sum;
  }));
  std::printf("sink=%u\n", sink);
  return EXIT_SUCCESS;
}

View File

@ -1,7 +1,7 @@
#if defined(__linux__) || (__APPLE__ && __aarch64__)
#define USING_COUNTERS
#endif
#include "event_counter.h"
#include "counters/event_counter.h"
#include <algorithm>
#include "fast_float/fast_float.h"
#include <chrono>
@ -102,7 +102,7 @@ void pretty_print(double volume, size_t number_of_floats, std::string name,
branches_avg += branches;
branches_min = branches_min < branches ? branches_min : branches;
double branch_misses = e.missed_branches();
double branch_misses = e.branch_misses();
branch_misses_avg += branch_misses;
branch_misses_min =
branch_misses_min < branch_misses ? branch_misses_min : branch_misses;

View File

@ -1,181 +0,0 @@
#ifndef __EVENT_COUNTER_H
#define __EVENT_COUNTER_H
#include <cctype>
#ifndef _MSC_VER
#include <dirent.h>
#endif
#include <cinttypes>
#include <cstring>
#include <chrono>
#include <vector>
#include "linux-perf-events.h"
#ifdef __linux__
#include <libgen.h>
#endif
#if (defined(__APPLE__) && __APPLE__) && (defined(__aarch64__) && __aarch64__)
#include "apple_arm_events.h"
#endif
/**
 * One measurement: elapsed wall-clock time plus a vector of raw event counter
 * values. Slots 0..3 hold cycles, instructions, branches and missed branches
 * (see event_counter_types); a fifth slot is carried along as well.
 *
 * Instances built through the constructors always hold at least
 * NUM_EVENT_SLOTS counters, so the accessors below never index out of range.
 * Copy and move operations are the compiler-generated ones.
 */
struct event_count {
  std::chrono::duration<double> elapsed;
  std::vector<unsigned long long> event_counts;

  // Minimum number of counter slots held by every constructed instance.
  enum { NUM_EVENT_SLOTS = 5 };

  /// Zero elapsed time and NUM_EVENT_SLOTS zeroed counters.
  event_count() : elapsed(0), event_counts(NUM_EVENT_SLOTS, 0ULL) {}

  /// Builds a measurement from explicit values; a counter vector shorter than
  /// NUM_EVENT_SLOTS is padded with zeros.
  event_count(const std::chrono::duration<double> _elapsed,
              const std::vector<unsigned long long> &_event_counts)
      : elapsed(_elapsed), event_counts(_event_counts) {
    if (event_counts.size() < NUM_EVENT_SLOTS) {
      event_counts.resize(NUM_EVENT_SLOTS, 0ULL);
    }
  }

  // The types of counters (so we can read the getter more easily)
  enum event_counter_types {
    CPU_CYCLES = 0,
    INSTRUCTIONS = 1,
    BRANCHES = 2,
    MISSED_BRANCHES = 3
  };

  /// Elapsed time in seconds.
  double elapsed_sec() const {
    return std::chrono::duration<double>(elapsed).count();
  }
  /// Elapsed time in nanoseconds.
  double elapsed_ns() const {
    return std::chrono::duration<double, std::nano>(elapsed).count();
  }
  double cycles() const {
    return static_cast<double>(event_counts[CPU_CYCLES]);
  }
  double instructions() const {
    return static_cast<double>(event_counts[INSTRUCTIONS]);
  }
  double branches() const {
    return static_cast<double>(event_counts[BRANCHES]);
  }
  double missed_branches() const {
    return static_cast<double>(event_counts[MISSED_BRANCHES]);
  }

  /// Adds `other` to this measurement in place: elapsed times are summed and
  /// counters are summed slot by slot over the slots both operands have.
  event_count &operator+=(const event_count &other) {
    elapsed += other.elapsed;
    const std::size_t shared =
        event_counts.size() < other.event_counts.size()
            ? event_counts.size()
            : other.event_counts.size();
    for (std::size_t i = 0; i < shared; ++i) {
      event_counts[i] += other.event_counts[i];
    }
    return *this;
  }

  /// Returns the slot-wise sum of two measurements.
  event_count operator+(const event_count &other) const {
    event_count sum(*this);
    sum += other;
    return sum;
  }
};
/**
 * Accumulates event_count samples: keeps the running total, the sample with
 * the shortest elapsed time (`best`) and the one with the longest (`worst`).
 *
 * The averaging accessors divide the total by `iterations`, so they are only
 * meaningful once at least one sample has been recorded.
 */
struct event_aggregate {
  bool has_events = false;
  int iterations = 0;
  event_count total{};
  event_count best{};
  event_count worst{};

  event_aggregate() = default;

  /// Records one sample: updates best/worst, then the count and the total.
  void operator<<(const event_count &other) {
    // The very first sample becomes both the best and the worst.
    const bool first_sample = (iterations == 0);
    if (first_sample || other.elapsed < best.elapsed) {
      best = other;
    }
    if (first_sample || other.elapsed > worst.elapsed) {
      worst = other;
    }
    iterations++;
    total += other;
  }

  // Per-sample averages of the accumulated totals.
  double elapsed_sec() const { return per_sample(total.elapsed_sec()); }
  double elapsed_ns() const { return per_sample(total.elapsed_ns()); }
  double cycles() const { return per_sample(total.cycles()); }
  double instructions() const { return per_sample(total.instructions()); }
  double branches() const { return per_sample(total.branches()); }
  double missed_branches() const {
    return per_sample(total.missed_branches());
  }

private:
  /// Divides an accumulated value by the number of recorded samples.
  double per_sample(double accumulated) const {
    return accumulated / iterations;
  }
};
/**
 * Measures elapsed wall-clock time (steady_clock) and, where a backend is
 * compiled in, event counters between a start() and the following end().
 *
 * Backends are selected at compile time:
 *  - Linux: a LinuxEvents<PERF_TYPE_HARDWARE> group counting cycles,
 *    instructions, branch instructions and branch misses;
 *  - Apple on aarch64: setup_performance_counters() / get_counters();
 *  - anything else: timing only, has_events() is always false.
 *
 * All Linux sections test the same macro (__linux__) so that the member, its
 * constructor and its use in start()/end() are always enabled together.
 */
struct event_collector {
  event_count count{};
  std::chrono::time_point<std::chrono::steady_clock> start_clock{};

#if defined(__linux__)
  LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
  event_collector()
      : linux_events(std::vector<int>{
            PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS,
            PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions
            PERF_COUNT_HW_BRANCH_MISSES}) {}
  /// True while the Linux backend has not reported an error.
  bool has_events() { return linux_events.is_working(); }
#elif __APPLE__ && __aarch64__
  performance_counters diff;
  event_collector() : diff(0) { setup_performance_counters(); }
  /// Returns whatever setup_performance_counters() reports.
  bool has_events() { return setup_performance_counters(); }
#else
  event_collector() {}
  /// No counter backend on this platform.
  bool has_events() { return false; }
#endif

  /// Starts a measurement: arms the counters (if any), then records the
  /// start time.
  inline void start() {
#if defined(__linux__)
    linux_events.start();
#elif __APPLE__ && __aarch64__
    if (has_events()) {
      diff = get_counters();
    }
#endif
    start_clock = std::chrono::steady_clock::now();
  }

  /// Ends the measurement started by start() and returns a reference to the
  /// internal `count`, which is overwritten by the next call to end().
  inline event_count &end() {
    // Take the end time first so counter read-out is not included in it.
    const auto end_clock = std::chrono::steady_clock::now();
#if defined(__linux__)
    linux_events.end(count.event_counts);
#elif __APPLE__ && __aarch64__
    if (has_events()) {
      performance_counters end = get_counters();
      diff = end - diff;
    }
    count.event_counts[0] = diff.cycles;
    count.event_counts[1] = diff.instructions;
    count.event_counts[2] = diff.branches;
    count.event_counts[3] = diff.missed_branches;
    count.event_counts[4] = 0;
#endif
    count.elapsed = end_clock - start_clock;
    return count;
  }
};
#endif

View File

@ -1,104 +0,0 @@
#pragma once
#ifdef __linux__
#include <asm/unistd.h> // for __NR_perf_event_open
#include <linux/perf_event.h> // for perf event constants
#include <sys/ioctl.h> // for ioctl
#include <unistd.h> // for syscall
#include <cerrno> // for errno
#include <cstring> // for memset
#include <stdexcept>
#include <iostream>
#include <vector>
/**
 * Thin wrapper around a group of Linux perf events of type TYPE.
 *
 * The constructor opens one event per entry of the configuration vector; the
 * first event that opens successfully becomes the group leader and the others
 * are attached to it. start() resets and enables the whole group, end()
 * disables it and reads all counter values in one call.
 *
 * Errors never throw: they only clear the `working` flag, which is_working()
 * reports. Every descriptor that was opened is closed by the destructor. The
 * class owns those descriptors and is therefore not copyable.
 */
template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
  int fd; // group leader descriptor, -1 while no event has been opened
  bool working;
  perf_event_attr attribs{};
  size_t num_events{};
  std::vector<uint64_t> temp_result_vec{};
  std::vector<uint64_t> ids{};
  std::vector<int> opened_fds{}; // every descriptor opened, leader included

public:
  explicit LinuxEvents(std::vector<int> config_vec) : fd(-1), working(true) {
    memset(&attribs, 0, sizeof(attribs));
    attribs.type = TYPE;
    attribs.size = sizeof(attribs);
    attribs.disabled = 1;
    attribs.exclude_kernel = 1;
    attribs.exclude_hv = 1;
    attribs.sample_period = 0;
    attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;

    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
    const unsigned long flags = 0;
    int group = -1; // no group

    num_events = config_vec.size();
    ids.resize(config_vec.size());
    opened_fds.reserve(config_vec.size());
    uint32_t i = 0;
    for (auto config : config_vec) {
      attribs.config = config;
      const int _fd = static_cast<int>(
          syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
      // Each configuration keeps its own id slot even when it fails to open,
      // so ids stays aligned with config_vec.
      uint64_t &id_slot = ids[i++];
      if (_fd == -1) {
        report_error("perf_event_open");
        continue; // nothing to query or to close for this event
      }
      opened_fds.push_back(_fd);
      if (ioctl(_fd, PERF_EVENT_IOC_ID, &id_slot) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_ID)");
      }
      if (group == -1) {
        // first successfully opened event: it leads the group
        group = _fd;
        fd = _fd;
      }
    }
    // read layout with PERF_FORMAT_GROUP | PERF_FORMAT_ID:
    // one leading count, then a (value, id) pair per event
    temp_result_vec.resize(num_events * 2 + 1);
  }

  // Owning raw descriptors: a copy would close them twice.
  LinuxEvents(const LinuxEvents &) = delete;
  LinuxEvents &operator=(const LinuxEvents &) = delete;

  ~LinuxEvents() {
    // Close every descriptor we opened, group leader last.
    for (auto it = opened_fds.rbegin(); it != opened_fds.rend(); ++it) {
      close(*it);
    }
  }

  /// Resets and enables all counters of the group.
  inline void start() {
    if (fd != -1) {
      if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_RESET)");
      }
      if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
      }
    }
  }

  /// Disables the group and copies the counter values into `results`
  /// (one slot per configured event, in configuration order). `results` must
  /// hold at least as many elements as there are configured events.
  inline void end(std::vector<unsigned long long> &results) {
    if (fd != -1) {
      if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
      }
      if (read(fd, temp_result_vec.data(),
               temp_result_vec.size() * sizeof(uint64_t)) == -1) {
        report_error("read");
      }
    }
    // our actual results are in slots 1,3,5, ... of this structure
    for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) {
      results[i / 2] = temp_result_vec[i];
    }
    // slots 2,4,6, ... carry the event ids; they must match what we recorded
    for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) {
      if (ids[i / 2 - 1] != temp_result_vec[i]) {
        report_error("event mismatch");
      }
    }
  }

  /// False once any operation has reported an error.
  bool is_working() { return working; }

private:
  // The message is currently unused; an error only clears the flag.
  void report_error(const std::string &) { working = false; }
};
#endif