A complete unfailing solution to thread scheduling, which should yield exactly the same times per each test, is to compile your program to be OS independent and boot up your computer so as to run the program in an OS-free environment. Yet, this is largely impractical and would be difficult at best.
A good substitute to going OS-free is just to set the affinity of the current thread to 1 core and the priority to the highest. This alternative should provide consistent-enough results.
Assuming that you use -Ofast
(or, at the very least, -O3
) on the final production build and ignoring the issue of "dead" code elimination, -Og
performs very few optimizations compared to -Ofast
; thus -Og
can misrepresent the real speed of the code in the final product.
Further, all speed tests (to some extent) perjure: in the final production product compiled with -Ofast
, each snippet/section/function of code is not isolated; rather, each snippet of code continuously flows into the next, thus allowing the compiler to potential join, merge, and optimize together pieces of code from all over the place.
At the same time, if you are benchmarking a snippet of code which makes heavy use of realloc()
, then the snippet of code might run slower in a production product with high enough memory fragmentation. Hence, the expression "the whole is more than the sum of its parts" applies to this situation because code in the final production build might run noticeably faster or slower than the individual snippet which you are speed testing.
A partial solution that may lessen the incongruity is using -Ofast
for speed testing WITH the addition of asm volatile("" :: "r"(var))
to the variables involved in the test for preventing dead code/loop elimination.
Here is an example of how to benchmark square root functions on a Windows computer.
// set USE_ASM_TO_PREVENT_ELIMINATION to 0 to prevent `asm volatile("" :: "r"(var))`
// set USE_ASM_TO_PREVENT_ELIMINATION to 1 to enforce `asm volatile("" :: "r"(var))`
#define USE_ASM_TO_PREVENT_ELIMINATION 1
#include <iostream>
#include <iomanip>
#include <cstdio>
#include <chrono>
#include <cmath>
#include <windows.h>
#include <intrin.h>
#pragma intrinsic(__rdtsc)
#include <cstdint>
class Timer {
public:
Timer() : beg_(clock_::now()) {}
void reset() { beg_ = clock_::now(); }
double elapsed() const {
return std::chrono::duration_cast<second_>
(clock_::now() - beg_).count(); }
private:
typedef std::chrono::high_resolution_clock clock_;
typedef std::chrono::duration<double, std::ratio<1> > second_;
std::chrono::time_point<clock_> beg_;
};
unsigned int guess_sqrt32(register unsigned int n) {
register unsigned int g = 0x8000;
if(g*g > n) {
g ^= 0x8000;
}
g |= 0x4000;
if(g*g > n) {
g ^= 0x4000;
}
g |= 0x2000;
if(g*g > n) {
g ^= 0x2000;
}
g |= 0x1000;
if(g*g > n) {
g ^= 0x1000;
}
g |= 0x0800;
if(g*g > n) {
g ^= 0x0800;
}
g |= 0x0400;
if(g*g > n) {
g ^= 0x0400;
}
g |= 0x0200;
if(g*g > n) {
g ^= 0x0200;
}
g |= 0x0100;
if(g*g > n) {
g ^= 0x0100;
}
g |= 0x0080;
if(g*g > n) {
g ^= 0x0080;
}
g |= 0x0040;
if(g*g > n) {
g ^= 0x0040;
}
g |= 0x0020;
if(g*g > n) {
g ^= 0x0020;
}
g |= 0x0010;
if(g*g > n) {
g ^= 0x0010;
}
g |= 0x0008;
if(g*g > n) {
g ^= 0x0008;
}
g |= 0x0004;
if(g*g > n) {
g ^= 0x0004;
}
g |= 0x0002;
if(g*g > n) {
g ^= 0x0002;
}
g |= 0x0001;
if(g*g > n) {
g ^= 0x0001;
}
return g;
}
unsigned int empty_function( unsigned int _input ) {
return _input;
}
unsigned long long empty_ticks=0;
double empty_seconds=0;
Timer my_time;
template<unsigned int benchmark_repetitions>
void benchmark( char* function_name, auto (*function_to_do)( auto ) ) {
register unsigned int i=benchmark_repetitions;
register unsigned long long start=0;
my_time.reset();
start=__rdtsc();
while ( i-- ) {
auto result = (*function_to_do)( i << 7 );
#if USE_ASM_TO_PREVENT_ELIMINATION == 1
asm volatile("" :: "r"(
// There is no data type in C++ that is smaller than a char, so it will
// not throw a segmentation fault error to reinterpret any arbitrary
// data type as a char. Although, the compiler might not like it.
result
));
#endif
}
if ( function_name == nullptr ) {
empty_ticks = (__rdtsc()-start);
empty_seconds = my_time.elapsed();
std::cout<< "Empty:\n" << empty_ticks
<< " ticks\n" << benchmark_repetitions << " repetitions\n"
<< std::setprecision(15) << empty_seconds
<< " seconds\n\n";
} else {
std::cout<< function_name<<":\n" << (__rdtsc()-start-empty_ticks)
<< " ticks\n" << benchmark_repetitions << " repetitions\n"
<< std::setprecision(15) << (my_time.elapsed()-empty_seconds)
<< " seconds\n\n";
}
}
int main( void ) {
void* Cur_Thread= GetCurrentThread();
void* Cur_Process= GetCurrentProcess();
unsigned long long Current_Affinity;
unsigned long long System_Affinity;
unsigned long long furthest_affinity;
unsigned long long nearest_affinity;
if( ! SetThreadPriority(Cur_Thread,THREAD_PRIORITY_TIME_CRITICAL) ) {
SetThreadPriority( Cur_Thread, THREAD_PRIORITY_HIGHEST );
}
if( ! SetPriorityClass(Cur_Process,REALTIME_PRIORITY_CLASS) ) {
SetPriorityClass( Cur_Process, HIGH_PRIORITY_CLASS );
}
GetProcessAffinityMask( Cur_Process, &Current_Affinity, &System_Affinity );
furthest_affinity = 0x8000000000000000ULL>>__builtin_clzll(Current_Affinity);
nearest_affinity = 0x0000000000000001ULL<<__builtin_ctzll(Current_Affinity);
SetProcessAffinityMask( Cur_Process, furthest_affinity );
SetThreadAffinityMask( Cur_Thread, furthest_affinity );
const int repetitions=524288;
benchmark<repetitions>( nullptr, empty_function );
benchmark<repetitions>( "Standard Square Root", standard_sqrt );
benchmark<repetitions>( "Original Guess Square Root", original_guess_sqrt32 );
benchmark<repetitions>( "New Guess Square Root", new_guess_sqrt32 );
SetThreadPriority( Cur_Thread, THREAD_PRIORITY_IDLE );
SetPriorityClass( Cur_Process, IDLE_PRIORITY_CLASS );
SetProcessAffinityMask( Cur_Process, nearest_affinity );
SetThreadAffinityMask( Cur_Thread, nearest_affinity );
for (;;) { getchar(); }
return 0;
}
Also, credit to Mike Jarvis for his Timer.
Please note (this is very important) that if you are going to be running bigger code snippets, then you really must turn down the number of iterations to prevent your computer from freezing up.