#ifndef RDTSC_H
#define RDTSC_H

#include <stdint.h> // uint64_t

// Select the timestamp source: the hardware time-stamp counter on x86/x86-64
// targets, or a std::chrono clock everywhere else.
#if defined(_M_X64) || defined(_M_IX86) || defined(__x86_64) || defined(__i386)
#  ifdef _WIN32
#    include <intrin.h> // __rdtsc
#  else
#    include <x86intrin.h> // __rdtsc
#  endif
#  define HAS_HW_RDTSC 1
#else
#  include <chrono> // std::chrono::high_resolution_clock
#  define HAS_HW_RDTSC 0
#endif

/// Returns a 64-bit timestamp intended for lightweight profiling.
///
/// When HAS_HW_RDTSC is 1 the value is the result of the `__rdtsc()`
/// intrinsic. Otherwise it is the time since the epoch of
/// `std::chrono::high_resolution_clock`, expressed as a tick count.
///
/// The two variants do not share a unit, so only differences between two
/// values obtained from the same build are meaningful.
inline uint64_t rdtsc()
{
#if HAS_HW_RDTSC
    // _mm_lfence() might be used to serialize the instruction stream,
    // and it would guarantee that RDTSC will not be reordered with
    // other instructions. However, measurements show that the overhead
    // may be too big (easily 15 to 30 CPU cycles) for profiling
    // purposes: if reordering matters, the overhead matters too!

    // Forbid the compiler from reordering instructions
#  ifdef _MSC_VER
    _ReadWriteBarrier();
#  else
    __asm__ __volatile__("" : : : "memory");
#  endif

    uint64_t result = __rdtsc();

    // Forbid the compiler from reordering instructions
#  ifdef _MSC_VER
    _ReadWriteBarrier();
#  else
    __asm__ __volatile__("" : : : "memory");
#  endif

    return result;
#else
    // NOTE(review): the duration type passed to duration_cast was lost from
    // the source text; nanoseconds is assumed here -- confirm against the
    // upstream header.
    auto now = std::chrono::high_resolution_clock::now();
    return static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            now.time_since_epoch())
            .count());
#endif
}

#endif // RDTSC_H