This commit is contained in:
Daniel Lemire 2024-11-25 11:59:48 -05:00
parent 3f2cd66c1c
commit 8832c532b8
4 changed files with 125 additions and 127 deletions

View File

@ -101,8 +101,6 @@ inline performance_counters operator-(const performance_counters &a,
a.instructions - b.instructions); a.instructions - b.instructions);
} }
typedef float f32; typedef float f32;
typedef double f64; typedef double f64;
typedef int8_t i8; typedef int8_t i8;
@ -616,9 +614,7 @@ typedef struct {
#define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) #define lib_nelems(x) (sizeof(x) / sizeof((x)[0]))
#define lib_symbol_def(name) \ #define lib_symbol_def(name) \
{ \ { #name, (void **)&name }
#name, (void **)&name \
}
static const lib_symbol lib_symbols_kperf[] = { static const lib_symbol lib_symbols_kperf[] = {
lib_symbol_def(kpc_pmu_version), lib_symbol_def(kpc_pmu_version),
@ -933,7 +929,7 @@ typedef struct {
static const event_alias profile_events[] = { static const event_alias profile_events[] = {
{"cycles", {"cycles",
{ {
"FIXED_CYCLES", // Apple A7-A15//CORE_ACTIVE_CYCLE "FIXED_CYCLES", // Apple A7-A15//CORE_ACTIVE_CYCLE
"CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th
"CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom
}}, }},
@ -976,7 +972,6 @@ u64 counters_0[KPC_MAX_COUNTERS] = {0};
u64 counters_1[KPC_MAX_COUNTERS] = {0}; u64 counters_1[KPC_MAX_COUNTERS] = {0};
const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]); const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]);
bool setup_performance_counters() { bool setup_performance_counters() {
static bool init = false; static bool init = false;
static bool worked = false; static bool worked = false;
@ -995,7 +990,7 @@ bool setup_performance_counters() {
// check permission // check permission
int force_ctrs = 0; int force_ctrs = 0;
if (kpc_force_all_ctrs_get(&force_ctrs)) { if (kpc_force_all_ctrs_get(&force_ctrs)) {
//printf("Permission denied, xnu/kpc requires root privileges.\n"); // printf("Permission denied, xnu/kpc requires root privileges.\n");
return (worked = false); return (worked = false);
} }
int ret; int ret;
@ -1101,17 +1096,16 @@ inline performance_counters get_counters() {
} }
return 1; return 1;
} }
/*printf("counters value:\n"); /*printf("counters value:\n");
for (usize i = 0; i < ev_count; i++) { for (usize i = 0; i < ev_count; i++) {
const event_alias *alias = profile_events + i; const event_alias *alias = profile_events + i;
usize idx = counter_map[i]; usize idx = counter_map[i];
u64 val = counters_1[idx] - counters_0[idx]; u64 val = counters_1[idx] - counters_0[idx];
printf("%14s: %llu\n", alias->alias, val); printf("%14s: %llu\n", alias->alias, val);
}*/ }*/
return performance_counters{ return performance_counters{
counters_0[counter_map[0]], counters_0[counter_map[2]], counters_0[counter_map[0]], counters_0[counter_map[2]],
counters_0[counter_map[3]], counters_0[counter_map[3]], counters_0[counter_map[1]]};
counters_0[counter_map[1]]};
} }
#endif #endif

View File

@ -1,4 +1,4 @@
#if defined(__linux__) || (__APPLE__ && __aarch64__) #if defined(__linux__) || (__APPLE__ && __aarch64__)
#define USING_COUNTERS #define USING_COUNTERS
#include "event_counter.h" #include "event_counter.h"
#endif #endif
@ -22,7 +22,6 @@
#include <vector> #include <vector>
#include <locale.h> #include <locale.h>
template <typename CharT> template <typename CharT>
double findmax_fastfloat64(std::vector<std::basic_string<CharT>> &s) { double findmax_fastfloat64(std::vector<std::basic_string<CharT>> &s) {
double answer = 0; double answer = 0;
@ -55,8 +54,9 @@ event_collector collector{};
#ifdef USING_COUNTERS #ifdef USING_COUNTERS
template <class T, class CharT> template <class T, class CharT>
std::vector<event_count> time_it_ns(std::vector<std::basic_string<CharT>> &lines, std::vector<event_count>
T const &function, size_t repeat) { time_it_ns(std::vector<std::basic_string<CharT>> &lines, T const &function,
size_t repeat) {
std::vector<event_count> aggregate; std::vector<event_count> aggregate;
bool printed_bug = false; bool printed_bug = false;
for (size_t i = 0; i < repeat; i++) { for (size_t i = 0; i < repeat; i++) {
@ -71,7 +71,8 @@ std::vector<event_count> time_it_ns(std::vector<std::basic_string<CharT>> &lines
return aggregate; return aggregate;
} }
void pretty_print(double volume, size_t number_of_floats, std::string name, std::vector<event_count> events) { void pretty_print(double volume, size_t number_of_floats, std::string name,
std::vector<event_count> events) {
double volumeMB = volume / (1024. * 1024.); double volumeMB = volume / (1024. * 1024.);
double average_ns{0}; double average_ns{0};
double min_ns{DBL_MAX}; double min_ns{DBL_MAX};
@ -83,7 +84,7 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, std:
double branches_avg{0}; double branches_avg{0};
double branch_misses_min{0}; double branch_misses_min{0};
double branch_misses_avg{0}; double branch_misses_avg{0};
for(event_count e : events) { for (event_count e : events) {
double ns = e.elapsed_ns(); double ns = e.elapsed_ns();
average_ns += ns; average_ns += ns;
min_ns = min_ns < ns ? min_ns : ns; min_ns = min_ns < ns ? min_ns : ns;
@ -94,7 +95,8 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, std:
double instructions = e.instructions(); double instructions = e.instructions();
instructions_avg += instructions; instructions_avg += instructions;
instructions_min = instructions_min < instructions ? instructions_min : instructions; instructions_min =
instructions_min < instructions ? instructions_min : instructions;
double branches = e.branches(); double branches = e.branches();
branches_avg += branches; branches_avg += branches;
@ -102,43 +104,37 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, std:
double branch_misses = e.missed_branches(); double branch_misses = e.missed_branches();
branch_misses_avg += branch_misses; branch_misses_avg += branch_misses;
branch_misses_min = branch_misses_min < branch_misses ? branch_misses_min : branch_misses; branch_misses_min =
branch_misses_min < branch_misses ? branch_misses_min : branch_misses;
} }
cycles_avg /= events.size(); cycles_avg /= events.size();
instructions_avg /= events.size(); instructions_avg /= events.size();
average_ns /= events.size(); average_ns /= events.size();
branches_avg /= events.size(); branches_avg /= events.size();
printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(), printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
volumeMB * 1000000000 / min_ns, volumeMB * 1000000000 / min_ns,
(average_ns - min_ns) * 100.0 / average_ns); (average_ns - min_ns) * 100.0 / average_ns);
printf("%8.2f Mfloat/s ", printf("%8.2f Mfloat/s ", number_of_floats * 1000 / min_ns);
number_of_floats * 1000 / min_ns); if (instructions_min > 0) {
if(instructions_min > 0) { printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ", instructions_min / volume,
printf(" %8.2f i/B %8.2f i/f (+/- %.1f %%) ",
instructions_min / volume,
instructions_min / number_of_floats, instructions_min / number_of_floats,
(instructions_avg - instructions_min) * 100.0 / instructions_avg); (instructions_avg - instructions_min) * 100.0 / instructions_avg);
printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", printf(" %8.2f c/B %8.2f c/f (+/- %.1f %%) ", cycles_min / volume,
cycles_min / volume,
cycles_min / number_of_floats, cycles_min / number_of_floats,
(cycles_avg - cycles_min) * 100.0 / cycles_avg); (cycles_avg - cycles_min) * 100.0 / cycles_avg);
printf(" %8.2f i/c ", printf(" %8.2f i/c ", instructions_min / cycles_min);
instructions_min /cycles_min); printf(" %8.2f b/f ", branches_avg / number_of_floats);
printf(" %8.2f b/f ", printf(" %8.2f bm/f ", branch_misses_avg / number_of_floats);
branches_avg /number_of_floats); printf(" %8.2f GHz ", cycles_min / min_ns);
printf(" %8.2f bm/f ",
branch_misses_avg /number_of_floats);
printf(" %8.2f GHz ",
cycles_min / min_ns);
} }
printf("\n"); printf("\n");
} }
#else #else
template <class T, class CharT> template <class T, class CharT>
std::pair<double, double> time_it_ns(std::vector<std::basic_string<CharT>> &lines, std::pair<double, double>
T const &function, size_t repeat) { time_it_ns(std::vector<std::basic_string<CharT>> &lines, T const &function,
size_t repeat) {
std::chrono::high_resolution_clock::time_point t1, t2; std::chrono::high_resolution_clock::time_point t1, t2;
double average = 0; double average = 0;
double min_value = DBL_MAX; double min_value = DBL_MAX;
@ -160,22 +156,17 @@ std::pair<double, double> time_it_ns(std::vector<std::basic_string<CharT>> &line
return std::make_pair(min_value, average); return std::make_pair(min_value, average);
} }
void pretty_print(double volume, size_t number_of_floats, std::string name,
std::pair<double, double> result) {
void pretty_print(double volume, size_t number_of_floats, std::string name, std::pair<double,double> result) {
double volumeMB = volume / (1024. * 1024.); double volumeMB = volume / (1024. * 1024.);
printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(), printf("%-40s: %8.2f MB/s (+/- %.1f %%) ", name.data(),
volumeMB * 1000000000 / result.first, volumeMB * 1000000000 / result.first,
(result.second - result.first) * 100.0 / result.second); (result.second - result.first) * 100.0 / result.second);
printf("%8.2f Mfloat/s ", printf("%8.2f Mfloat/s ", number_of_floats * 1000 / result.first);
number_of_floats * 1000 / result.first); printf(" %8.2f ns/f \n", double(result.first) / number_of_floats);
printf(" %8.2f ns/f \n",
double(result.first) /number_of_floats );
} }
#endif #endif
// this is okay, all chars are ASCII // this is okay, all chars are ASCII
inline std::u16string widen(std::string line) { inline std::u16string widen(std::string line) {
std::u16string u16line; std::u16string u16line;
@ -195,21 +186,23 @@ std::vector<std::u16string> widen(const std::vector<std::string> &lines) {
return u16lines; return u16lines;
} }
void process(std::vector<std::string> &lines, size_t volume) { void process(std::vector<std::string> &lines, size_t volume) {
size_t repeat = 100; size_t repeat = 100;
double volumeMB = volume / (1024. * 1024.); double volumeMB = volume / (1024. * 1024.);
std::cout << "ASCII volume = " << volumeMB << " MB " << std::endl; std::cout << "ASCII volume = " << volumeMB << " MB " << std::endl;
pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines, findmax_fastfloat64<char>, repeat)); pretty_print(volume, lines.size(), "fastfloat (64)",
pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines, findmax_fastfloat32<char>, repeat)); time_it_ns(lines, findmax_fastfloat64<char>, repeat));
pretty_print(volume, lines.size(), "fastfloat (32)",
time_it_ns(lines, findmax_fastfloat32<char>, repeat));
std::vector<std::u16string> lines16 = widen(lines); std::vector<std::u16string> lines16 = widen(lines);
volume = 2 * volume; volume = 2 * volume;
volumeMB = volume / (1024. * 1024.); volumeMB = volume / (1024. * 1024.);
std::cout << "UTF-16 volume = " << volumeMB << " MB " << std::endl; std::cout << "UTF-16 volume = " << volumeMB << " MB " << std::endl;
pretty_print(volume, lines.size(), "fastfloat (64)", time_it_ns(lines16, findmax_fastfloat64<char16_t>, repeat)); pretty_print(volume, lines.size(), "fastfloat (64)",
pretty_print(volume, lines.size(), "fastfloat (32)", time_it_ns(lines16, findmax_fastfloat32<char16_t>, repeat)); time_it_ns(lines16, findmax_fastfloat64<char16_t>, repeat));
pretty_print(volume, lines.size(), "fastfloat (32)",
time_it_ns(lines16, findmax_fastfloat32<char16_t>, repeat));
} }
void fileload(std::string filename) { void fileload(std::string filename) {
@ -233,13 +226,14 @@ void fileload(std::string filename) {
process(lines, volume); process(lines, volume);
} }
int main(int argc, char **argv) { int main(int argc, char **argv) {
if(collector.has_events()) { if (collector.has_events()) {
std::cout << "# Using hardware counters" << std::endl; std::cout << "# Using hardware counters" << std::endl;
} else { } else {
#if defined(__linux__) || (__APPLE__ && __aarch64__) #if defined(__linux__) || (__APPLE__ && __aarch64__)
std::cout << "# Hardware counters not available, try to run in privileged mode (e.g., sudo)." << std::endl; std::cout << "# Hardware counters not available, try to run in privileged "
"mode (e.g., sudo)."
<< std::endl;
#endif #endif
} }
fileload(std::string(BENCHMARK_DATA_DIR) + "/canada.txt"); fileload(std::string(BENCHMARK_DATA_DIR) + "/canada.txt");

View File

@ -17,16 +17,19 @@
#include <libgen.h> #include <libgen.h>
#endif #endif
#if __APPLE__ && __aarch64__ #if __APPLE__ && __aarch64__
#include "apple_arm_events.h" #include "apple_arm_events.h"
#endif #endif
struct event_count { struct event_count {
std::chrono::duration<double> elapsed; std::chrono::duration<double> elapsed;
std::vector<unsigned long long> event_counts; std::vector<unsigned long long> event_counts;
event_count() : elapsed(0), event_counts{0,0,0,0,0} {} event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {}
event_count(const std::chrono::duration<double> _elapsed, const std::vector<unsigned long long> _event_counts) : elapsed(_elapsed), event_counts(_event_counts) {} event_count(const std::chrono::duration<double> _elapsed,
event_count(const event_count& other): elapsed(other.elapsed), event_counts(other.event_counts) { } const std::vector<unsigned long long> _event_counts)
: elapsed(_elapsed), event_counts(_event_counts) {}
event_count(const event_count &other)
: elapsed(other.elapsed), event_counts(other.event_counts) {}
// The types of counters (so we can read the getter more easily) // The types of counters (so we can read the getter more easily)
enum event_counter_types { enum event_counter_types {
@ -36,31 +39,42 @@ struct event_count {
MISSED_BRANCHES = 3 MISSED_BRANCHES = 3
}; };
double elapsed_sec() const { return std::chrono::duration<double>(elapsed).count(); } double elapsed_sec() const {
double elapsed_ns() const { return std::chrono::duration<double, std::nano>(elapsed).count(); } return std::chrono::duration<double>(elapsed).count();
double cycles() const { return static_cast<double>(event_counts[CPU_CYCLES]); } }
double instructions() const { return static_cast<double>(event_counts[INSTRUCTIONS]); } double elapsed_ns() const {
double branches() const { return static_cast<double>(event_counts[BRANCHES]); } return std::chrono::duration<double, std::nano>(elapsed).count();
double missed_branches() const { return static_cast<double>(event_counts[MISSED_BRANCHES]); } }
double cycles() const {
return static_cast<double>(event_counts[CPU_CYCLES]);
}
double instructions() const {
return static_cast<double>(event_counts[INSTRUCTIONS]);
}
double branches() const {
return static_cast<double>(event_counts[BRANCHES]);
}
double missed_branches() const {
return static_cast<double>(event_counts[MISSED_BRANCHES]);
}
event_count& operator=(const event_count& other) { event_count &operator=(const event_count &other) {
this->elapsed = other.elapsed; this->elapsed = other.elapsed;
this->event_counts = other.event_counts; this->event_counts = other.event_counts;
return *this; return *this;
} }
event_count operator+(const event_count& other) const { event_count operator+(const event_count &other) const {
return event_count(elapsed+other.elapsed, { return event_count(elapsed + other.elapsed,
event_counts[0]+other.event_counts[0], {
event_counts[1]+other.event_counts[1], event_counts[0] + other.event_counts[0],
event_counts[2]+other.event_counts[2], event_counts[1] + other.event_counts[1],
event_counts[3]+other.event_counts[3], event_counts[2] + other.event_counts[2],
event_counts[4]+other.event_counts[4], event_counts[3] + other.event_counts[3],
}); event_counts[4] + other.event_counts[4],
});
} }
void operator+=(const event_count& other) { void operator+=(const event_count &other) { *this = *this + other; }
*this = *this + other;
}
}; };
struct event_aggregate { struct event_aggregate {
@ -72,7 +86,7 @@ struct event_aggregate {
event_aggregate() = default; event_aggregate() = default;
void operator<<(const event_count& other) { void operator<<(const event_count &other) {
if (iterations == 0 || other.elapsed < best.elapsed) { if (iterations == 0 || other.elapsed < best.elapsed) {
best = other; best = other;
} }
@ -88,7 +102,9 @@ struct event_aggregate {
double cycles() const { return total.cycles() / iterations; } double cycles() const { return total.cycles() / iterations; }
double instructions() const { return total.instructions() / iterations; } double instructions() const { return total.instructions() / iterations; }
double branches() const { return total.branches() / iterations; } double branches() const { return total.branches() / iterations; }
double missed_branches() const { return total.missed_branches() / iterations; } double missed_branches() const {
return total.missed_branches() / iterations;
}
}; };
struct event_collector { struct event_collector {
@ -97,44 +113,37 @@ struct event_collector {
#if defined(__linux__) #if defined(__linux__)
LinuxEvents<PERF_TYPE_HARDWARE> linux_events; LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
event_collector() : linux_events(std::vector<int>{ event_collector()
PERF_COUNT_HW_CPU_CYCLES, : linux_events(std::vector<int>{
PERF_COUNT_HW_INSTRUCTIONS, PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS,
PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions
PERF_COUNT_HW_BRANCH_MISSES PERF_COUNT_HW_BRANCH_MISSES}) {}
}) {} bool has_events() { return linux_events.is_working(); }
bool has_events() { #elif __APPLE__ && __aarch64__
return linux_events.is_working();
}
#elif __APPLE__ && __aarch64__
performance_counters diff; performance_counters diff;
event_collector() : diff(0) { event_collector() : diff(0) { setup_performance_counters(); }
setup_performance_counters(); bool has_events() { return setup_performance_counters(); }
}
bool has_events() {
return setup_performance_counters();
}
#else #else
event_collector() {} event_collector() {}
bool has_events() { bool has_events() { return false; }
return false;
}
#endif #endif
inline void start() { inline void start() {
#if defined(__linux) #if defined(__linux)
linux_events.start(); linux_events.start();
#elif __APPLE__ && __aarch64__ #elif __APPLE__ && __aarch64__
if(has_events()) { diff = get_counters(); } if (has_events()) {
diff = get_counters();
}
#endif #endif
start_clock = std::chrono::steady_clock::now(); start_clock = std::chrono::steady_clock::now();
} }
inline event_count& end() { inline event_count &end() {
const auto end_clock = std::chrono::steady_clock::now(); const auto end_clock = std::chrono::steady_clock::now();
#if defined(__linux) #if defined(__linux)
linux_events.end(count.event_counts); linux_events.end(count.event_counts);
#elif __APPLE__ && __aarch64__ #elif __APPLE__ && __aarch64__
if(has_events()) { if (has_events()) {
performance_counters end = get_counters(); performance_counters end = get_counters();
diff = end - diff; diff = end - diff;
} }

View File

@ -42,7 +42,8 @@ public:
uint32_t i = 0; uint32_t i = 0;
for (auto config : config_vec) { for (auto config : config_vec) {
attribs.config = config; attribs.config = config;
int _fd = static_cast<int>(syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); int _fd = static_cast<int>(
syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
if (_fd == -1) { if (_fd == -1) {
report_error("perf_event_open"); report_error("perf_event_open");
} }
@ -56,7 +57,11 @@ public:
temp_result_vec.resize(num_events * 2 + 1); temp_result_vec.resize(num_events * 2 + 1);
} }
~LinuxEvents() { if (fd != -1) { close(fd); } } ~LinuxEvents() {
if (fd != -1) {
close(fd);
}
}
inline void start() { inline void start() {
if (fd != -1) { if (fd != -1) {
@ -85,19 +90,15 @@ public:
results[i / 2] = temp_result_vec[i]; results[i / 2] = temp_result_vec[i];
} }
for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) { for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) {
if(ids[i/2-1] != temp_result_vec[i]) { if (ids[i / 2 - 1] != temp_result_vec[i]) {
report_error("event mismatch"); report_error("event mismatch");
} }
} }
} }
bool is_working() { bool is_working() { return working; }
return working;
}
private: private:
void report_error(const std::string &) { void report_error(const std::string &) { working = false; }
working = false;
}
}; };
#endif #endif