// Overview / Examples / API / FAQ / Resources
Performance
is not a number!
Single
header
/module
performance
library that combines the power of:
c++23
,linux/perf
,llvm/mca
,gnuplot/sixel
, ...
Profiling, Tracing, Analyzing, Plotting, Testing, Benchmarking
namespace description API info
hardware/software info compiler
,cpu
,memory
,sys
,proc
,bin
core
low-level code
,compiler
,cpu
,memory
mc
disassembling (llvm) assembly
,address
,encoding
,size
,uops
,latency
,rthroughput
,may_load
,may_store
,has_side_effects
, ...,source
mca
analyzing (llvm/mca) cycles
,instructions
,uops
,timeline
,resource_pressure
,bottleneck
time
timing (rdtsc/clock/chrono) tsc
,cpu
,thread
,real
,monotonic
,steady_clock
,high_resolution_clock
stat
counting (linux/perf) instructions
,cycles
, ...,top_down
record
sampling (linux/perf) instructions
,cycles
, ...,mem_loads
,mem_stores
,top_down
trace
tracing (linux/intel_pt) traces
,cycles
,tsc
bench
benchmarking baseline
,latency
,throughput
io
logging/plotting (gnuplot/sixel) log
,spec
,json
,report
,annotate
,plot
(hist
,box
,bar
,line
,ecdf
)
Optimal
- (
clang-19+
|gcc-13+
) /c++23+
llvm-19+
-apt-get install llvm-dev
linux-6.x+
perf-event-open
-apt-get install linux-tools-common
intel-12th+
with PEBS
,IPT
support
libipt
-apt-get install libipt-dev
terminal
with sixel
support
gnuplot
-apt-get install gnuplot
Auxiliary
gh
-apt-get install gh
prof
-https://github.com/qlibs/prof
linux-perf
-apt-get install linux-tools-common
intel-vtune
-apt-get install intel-oneapi-vtune
amd-uprof
-https://www.amd.com/en/developer/uprof.html#downloads
gperftools
-apt-get install google-perftools
llvm-xray
-apt-get install llvm
callgrind
-apt-get install valgrind
ut
-https://github.com/qlibs/ut
uefi
-https://github.com/qlibs/uefi
Info/Core
info::compiler::name
assert(perf::info::compiler::name() == "clang"s);
info::compiler::version
assert(perf::info::compiler::version() == perf::info::sem_ver{.major = 20, .minor = 0, .patch = 0});
info::cpu::name
assert(perf::info::cpu::name() == "12th Gen Intel(R) Core(TM) i7-12650"s);
info::cpu::code_name
assert(perf::info::cpu::code_name() == "alderlake"s);
info::cpu::version
assert(perf::info::cpu::version() == perf::info::cpu::cpu_ver{.family = 6, .model = 154, .stepping = 3});
info::cpu::features
assert(perf::info::cpu::features() == std::vector{"avx", "avx2", "bmi", ...});
info/memory
assert(perf::info::memory::icache() == std::map{{level::L1, {.size = 32768, .line_size = 64, .assoc = 8}}}); assert(perf::info::memory::dcache() == std::map{{level::L1, {.size = 49152, .line_size = 64, .assoc = 12}}, ...});
info/sys
assert(perf::info::sys::name() == "linux"s); assert(perf::info::sys::triple() == "x86_64-pc-linux-gnu"s); assert(perf::info::sys::page_size() == 4096b);
info/proc
assert(perf::info::proc::self::name() == "/tmp/perf.out"s); assert(perf::info::proc::self::base_address() > 0u);
info/bin
static auto fn = [] {}; auto&& fn_name = perf::info::bin::addr_to_fn_name( perf::info::proc::self::name(), std::uint64_t(&fn) - perf::info::proc::self::base_address() ); assert(fn_name.has_value() and *fn_name == "fn"s);static auto var = 0; auto&& var_name = perf::info::bin::addr_to_name( perf::info::proc::self::name(), std::uint64_t(&var) - perf::info::proc::self::base_address() ); assert(var_name.has_value() and *var_name == "var"s);// addr_to_line # requires debug symbols (-g) label:; auto&& source = perf::info::bin::addr_to_line( perf::info::proc::self::name(), std::uint64_t(&&label) - perf::info::proc::self::base_address() ); assert(source.has_value() and source->contains("source ="));
core/memory
const std::array add{ // x86-64 0x89, 0xf8, // mov eax, edi 0x01, 0xf0, // add eax, esi 0xc3 // ret }; memory::protect(std::span(add), memory::protection::read | memory::protection::write | memory::protection::exec); assert(invoke(add, 1, 2) == 3); assert(invoke(add, 2, 3) == 5);
core/compiler
// prevent_elision/is_elided assert(perf::compiler::is_elided([] { })); assert(perf::compiler::is_elided([] { int i{}; i++; })); assert(not perf::compiler::is_elided([] { int i{}; perf::compiler::prevent_elision(i++); }));Analyzing
backend/analyzer
Profiling
prof/timer
perf::time::timer t{perf::time::steady_clock}; t.start(); fn(); t.stop(); assert(t[perf::time::steady_clock] > perf::time::duration<perf::time::ns>(0.));perf::time::timer t{perf::time::steady_clock, perf::time::cpu}; t.start(); fn(); t.stop(); assert(t[perf::time::steady_clock] > perf::time::duration<perf::time::ns>(0.)); assert(t[perf::time::cpu] > perf::time::duration<perf::time::ns>(0.)); // `t[]` - returns std::tuple of all timers assert(std::get<0u>(t[]) > perf::time::duration<perf::time::ns>(0.)); // steady_clock assert(std::get<1u>(t[]) > perf::time::duration<perf::time::ns>(0.)); // time_cpuperf::time::steady_clock - monotonic-time perf::time::high_resolution_clock - highest available resolution clock perf::time::cpu - user-time + sys-time perf::time::thread - cpu-time for the current thread perf::time::real - wall-time perf::time::monotonic - guaranteed to be always increasing perf::time::tsc - time-stamp-counter
prof/counter
// metrics/dsl // top_down
prof/sampler
Tracing
prof/tracer
prof/trace
Plotting
plot/gnuplot
Note: See
Benchmarking
section for bench
related plottingTesting
utility/verify
perf::verify(fn...assembly); perf::verify(fn...cycles);
test
perf::test(); perf::test({.verbose = true});
-DNTEST
- disables compile-time/run-time tests import ut; "benchmark"_test = [] { // ... };Benchmarking
runner
auto fizz_buzz = [](int n) { if (n % 15 == 0) { return "FizzBuzz"; } else if (n % 3 == 0) { return "Fizz"; } else if (n % 5 == 0) { return "Buzz"; } else { return "Unknown"; } }; perf::runner bench{perf::bench::latency{}}; // what and how bench(fizz_buzz, 15); bench(fizz_buzz, 3); bench(fizz_buzz, 5); perf::report(bench[perf::time::steady_clock, perf::bench::operations, perf::bench::samples]); // total time
^^bench[...] == ^^std::vector<perf::name, std::tuple<named<Name, std::vector<Ts>...>>>
data
bench(fizz_buzz, perf::data::sequence<int>{{3,5,15}}); bench(fizz_buzz, perf::data::uniform<int>{.min = 0, .max = 15}); // choice
latency
vs. throughput
auto add = [](int a, int b) { return a + b; }; auto sub = [](int a, int b) { return a - b; }; auto mult = [](int a, int b) { return a * b; }; auto div = [](int a, int b) { return a / b; }; perf::runner bench{perf::bench::latency{}}; bench(add, 0, 0); // invoke(add, 0, 0) bench(sub, 0, 0); // invoke(sub, 0, 0) bench(mult, 0, 0); // invoke(mult, 0, 0) bench(div, 0, 0); // invoke(div, 0, 0) using perf::dsl::operator/; perf::report(bench[perf::time::tsc / perf::bench::operations, // ns/op perf::stat::cycles / perf::bench::operations] // cyc/op );inline constexpr auto latency = perf::time::steady_clock / perf::bench::operations; inline constexpr auto throughput = perf::bench::operations / perf::time::steady_clock; inline constexpr auto inverse_throughput = perf::time::steady_clock / perf::bench::operations;
bench/policy
bench/baseline
bench/debug
io/report
io/annotate
io/plot
// perf::plot::hist // perf::plot::bar // perf::plot::box // perf::plot::line // perf::plot::ecdf // complexityExporting/Sharing
.github/scripts/export.sh
Configuration
/** * PERF version # https://semver.org */ #define PERF (MAJOR, MINOR, PATCH) // ex. (1, 0, 0) /** * GNU # default: deduced based on `__GNUC__` * - 0 not compatible * - 1 compatible */ #define PERF_GNU 0/1 /** * Linux # default: deduced based on `__linux__` * - 0 not supported * - 1 supported */ #define PERF_LINUX 0/1 /** * LLVM # default: deduced based on `llvm-dev` headers * - 0 not supported * - 1 supported */ #define PERF_LLVM 0/1 /** * Intel Processor Trace # default: deduced based on `intel_pt` headers * - 0 not supported * - 1 supported */ #define PERF_INTEL 0/1 /** * I/O support # default: 1 * - 0 not compiled in * - 1 supported (`log, json, report, annotate, plot`) */ #define PERF_IO 0/1 /** * tests # default: not-defined * - defined: disables all compile-time, run-time tests * - not-defined: compile-time tests executed, * run-time tests available by `perf::test()` API */ #define NTEST/** * gnuplot terminal # see `gnuplot -> set terminal` # default: 'sixel' * - 'sixel' # console image # https://www.arewesixelyet.com * - 'wxt' # popup window * - 'dumb size 150,25 ansi' # console with colors * - 'dumb size 80,25' # console */ ENV:PERF_IO_PLOT_TERM /** * style # default: dark * - light * - dark */ ENV:PERF_IO_PLOT_STYLE
Setup
How to setup
perf
docker?docker build -t perf .
docker run \ -it \ --privileged \ --network=host \ -e DISPLAY=${DISPLAY} \ -v ${PWD}:${PWD} \ -w ${PWD} \ perfHow to install
perf
dependencies?apt-get install linux-tools-common # linux-perf (perf::stat/perf::record) apt-get install llvm-dev # llvm (perf::mc/perf::mca) apt-get install libipt-dev # libipt (perf::trace) apt-get install gnuplot # (perf::plot)How to setup
linux performance counters
?.github/scripts/setup.sh --perf # --rdpmc --max-sample-rate 10000
sudo mount -o remount,mode=755 /sys/kernel/debug sudo mount -o remount,mode=755 /sys/kernel/debug/tracing sudo chown `whoami` /sys/kernel/debug/tracing/uprobe_events sudo chmod a+rw /sys/kernel/debug/tracing/uprobe_events echo 0 | sudo tee /proc/sys/kernel/kptr_restrict echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid echo 1000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate
echo 2 | sudo tee /sys/devices/cpu_core/rdpmc
How to reduce
execution variability
?.github/scripts/tune.sh
pyperf
-pip3 install pyperf
sudo pyperf system tune sudo pyperf system show sudo pyperf system reset# Set Process CPU Affinity (apt install util-linux) taskset -c 0 ./a.out # Set Process Scheduling Priority (apt install coreutils) nice -n -20 taskset -c 0 ./a.out # -20..19 (most..least favorable to the process) # Disable CPU Frequency Scaling (apt install cpufrequtils) sudo cpupower frequency-set --governor performance # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor # Disable Address Space Randomization echo 0 > /proc/sys/kernel/randomize_va_space # Disable Processor Boosting echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost # Disable Turbo Mode echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo # Disable Hyperthreading/SMT echo off | sudo tee /sys/devices/system/cpu/smt/control # Restrict memory to a single socket numactl -m 0 -N 0 ./a.out # Enable Huge Pages sudo numactl --cpunodebind=1 --membind=1 hugeadm \ --obey-mempolicy --pool-pages-min=1G:64 sudo hugeadm --create-mounts# Enable Kernel Mode Task-Isolation (https://lwn.net/Articles/816298) # cat /sys/devices/system/cpu/isolated isolcpus=<cpu number>,...,<cpu number> # Disable P-states and C-states # cat /sys/devices/system/cpu/intel_pstate/status idle=poll intel_pstate=disable intel_idle.max_cstate=0 processor.max_cstate=1 # Disable NMI watchdog # cat /proc/sys/kernel/nmi_watchdog nmi_watchdog=0
Usage Guide
How to use
perf
with modules
?clang++ -std=c++23 -O3 -I. --precompile perf.cppm clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm *.cpp -lLLVM -lipt
import perf;How to change
assembly
syntax?perf::llvm llvm{ {.syntax = perf::arch::syntax::att} // default: intel };How to
disassemble
for a different platform?perf::llvm llvm{ .triple = "x86_64-pc-linux-gnu" // see `llvm-llc` for details };How to write custom
profiler
?struct profiler { // starts profiling constexpr auto start(); // stops profiling constexpr auto stop(); // filter results [[nodiscard]] constexpr auto operator[](Ts...) const; };static_assert(perf::profiler_like<profiler>);
How to integrate with
unit-testing
framework?import perf; import ut; // https://github.com/qlibs/ut int main() { "benchmark"_test = [] { // ... }; }Which
terminal
can display images?Any terminal with sixel support - https://www.arewesixelyet.com
How to plot with
popup windows
?PERF_IO_PLOT_TERM='wxt' ./a.out
How to plot without
sixel
?PERF_IO_PLOT_TERM='dumb' ./a.out PERF_IO_PLOT_TERM='dumb size 80,25' ./a.out PERF_IO_PLOT_TERM='dumb size 150,25 ansi' ./a.outHow to change plot style?
PERF_IO_PLOT_STYLE='dark' ./perf # default PERF_IO_PLOT_STYLE='light' ./perfHow to save plot?
perf::plot::gnuplot plt{{.term = "png"}}; plt.send("set output 'output.png'"); perf::plot::bar(plt, ...);How to
export
results?./a.out 2>&1 | .github/scripts/export.sh markdown > results.md ./a.out 2>&1 | .github/scripts/export.sh notebook > results.ipynb ./a.out 2>&1 | .github/scripts/export.sh html > results.htmlHow to
share
results?
gh
-apt-get install gh
# https://jbt.github.io/markdown-editor gh gist create --public --web results.md
# https://jupyter.org gh gist create --public --web results.ipynb
# https://htmlpreview.github.io gh gist create --public --web results.html
How to integrate with
jupyter
?
jupyter
can be used for data analysis (python)perf::json(...); // save to json file# apt install jupyter jupyter notebook --ip 0.0.0.0 --no-browser notebook.ipynb # read from saved jsonHow
perf
tests are working?
compile-time
tests are executed upon include/import
(enabled by default)
run-time/sanity check
tests can be executed at run-timeint main() { perf::test({.verbose = true}); // run-time/sanity check tests }
-DNTEST
can be used to disable tests (not recommended)$CXX -DNTEST ... # tests will NOT be compiled in
perf
tests execution model#ifndef NTEST "perf"_suite = [] { "run-time and compile-time"_test = [] constexpr { expect(3 == accumulate({1, 2, 3}, 0)); }; "run-time"_test = [] mutable { expect(std::rand() >= 0); }; "compile-time"_test = [] consteval { expect(sizeof(int) == sizeof(0)); }; }; #endif
Performance
What are
performance
compilation flags?-O1 # optimizations (O1) [0] -O2 # optimizations (O1 + O2) [0] -O3 # optimizations (O1 + O2 + O3) [0] -march=native # architecture specific [1] -DNDEBUG # disables asserts, etc.-fno-omit-frame-pointer # keeps the frame pointer in a register
-ffast-math # [unsafe] faster but non-conforming math [2] -fcf-protection=none # [unsafe] stops emitting `endbr64`[0] https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
[1] https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
[2] https://gcc.gnu.org/wiki/FloatingPointMath
What are
performance
compiler attributes?
gnu::target
[[gnu::target("avx2")]] [[gnu::target("bmi")]]
gnu::optimize
[[gnu::optimize("O3")]] [[gnu::optimize("ffast-math")]] https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
What is the difference between
latency
and throughput
?
latency
is the time it takes for a single operation to complete (ns)
throughput
is the total number of operations or tasks completed in a given amount of time (op/s)What is
top-down microarchitecture analysis
method?https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html
https://github.com/andikleen/pmu-tools/wiki/toplev-manual
How to integrate
profiling
tools?
prof
supports the following profilers
linux-perf
-apt-get install linux-tools-common
intel-vtune
-apt-get install intel-oneapi-vtune
amd-uprof
-https://www.amd.com/en/developer/uprof.html#downloads
gperftools
-apt-get install google-perftools
llvm-xray
-apt-get install llvm
callgrind
-apt-get install valgrind
Specification
# System topology (apt install hwloc) lstopo # lstopo-no-graphics # CPU info (apt install util-linux) lscpu | grep -E '^CPU|^Model|^Core|^Socket|^Thread' # Cache info lscpu | grep cache getconf -a | grep CACHE_LINESIZE # Numa nodes lscpu | grep -E ^NUMA # Huge pages cat /proc/meminfo | grep -i huge
Manuals
- Intel - https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html
- AMD - https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/40332.pdf
- ARM - https://developer.arm.com/documentation/ddi0487/latest
- Apple - https://developer.apple.com/documentation/apple-silicon/cpu-optimization-guide
Books
- Performance Analysis and Tuning on Modern CPUs - https://github.com/dendibakh/perf-book/releases
- The Art of Writing Efficient Programs - https://www.packtpub.com/product/the-art-of-writing-efficient-programs
- Algorithms for Modern Hardware - https://en.algorithmica.org/hpc
- Computer Architecture - https://dl.acm.org/doi/book/10.5555/1999263
- The Art of Assembly Language - https://www.plantation-productions.com/Webster/www.artofasm.com/Linux/HTML/AoATOC.html
- SIMD for C++ Developers - http://const.me/articles/simd/simd.pdf
- Memory Models - https://research.swtch.com/mm
- Data-Oriented Design - https://www.dataorienteddesign.com/dodbook
- Hacker's Delight - https://doc.lagout.org/security/Hackers%20Delight.pdf
References
- Performance Monitoring Events - https://perfmon-events.intel.com
- Performance Monitor Counters - https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/programmer-references/58550-0.01.pdf
- Instruction Reference - https://www.felixcloutier.com/x86
- Instruction Matrix - https://github.com/google/highway/blob/master/g3doc/instruction_matrix.pdf
- Instruction Tables: Lists of instruction latencies, throughputs and micro-operation breakdowns for Intel, AMD and VIA CPUs - https://www.agner.org/optimize/instruction_tables.pdf
- Intel Intrinsics - https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
- Opcode and Instruction Reference - http://ref.x86asm.net/
- CPUID data repository - https://x86-cpuid.org
- Intrinsics Cheatsheet - https://db.in.tum.de/~finis/x86-intrin-cheatsheet-v2.1.pdf
- Microarchitecture Cheatsheet - https://docs.google.com/spreadsheets/d/18ln8SKIGRK5_6NymgdB9oLbTJCFwx0iFI-vUs6WFyuE
Guides
- Low Latency Tuning Guide - https://rigtorp.se/low-latency-guide
- Optimizing Software in C++: An Optimization Guide for Windows, Linux and Mac platforms - https://www.agner.org/optimize/optimizing_cpp.pdf
- Optimizing Subroutines in Assembly Language: An Optimization Guide for x86 platforms - https://www.agner.org/optimize/optimizing_assembly.pdf
- Calling Conventions for different C++ compilers and operating systems - https://www.agner.org/optimize/calling_conventions.pdf
- The Microarchitecture of Intel, AMD and VIA CPUs: An Optimization Guide for Assembly programmers and compiler makers - https://www.agner.org/optimize/microarchitecture.pdf
- Is Parallel Programming Hard, And, If So, What Can You Do About It? - https://www.kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html
- RHEL Performance Guide - https://myllynen.github.io/rhel-performance-guide
- Performance Analysis Methodology - https://www.brendangregg.com/methodology.html
- Top-Down Microarchitecture Analysis Method - https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html
- Measuring Workloads with Top-down Microarchitecture Analysis - https://github.com/andikleen/pmu-tools/wiki/toplev-manual
- Active Benchmarking - https://www.brendangregg.com/activebenchmarking.html
- Apple Silicon Guide - https://github.com/mikeroyal/Apple-Silicon-Guide
- Envisioning a Simplified Intel Architecture - https://www.intel.com/content/www/us/en/developer/articles/technical/envisioning-future-simplified-architecture.html
- Monitoring and Managing System Status and Performance - https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/8/html/monitoring_and_managing_system_status_and_performance
- Modern Microprocessors A 90-Minute Guide! - https://www.lighterra.com/papers/modernmicroprocessors
- X3 Low Latency Quickstart - https://docs.amd.com/r/en-US/ug1586-onload-user/X3-Low-Latency-Quickstart
- Processor Information - https://sandpile.org
- SIMD Instruction List - https://www.officedaytime.com/simd512e
- Instruction Discovery And Analysis - https://explore.liblisa.nl
Publications
- What Every Programmer Should Know About Memory - https://www.akkadia.org/drepper/cpumemory.pdf
- Producing wrong data without doing anything obviously wrong! - https://dl.acm.org/doi/10.1145/1508284.1508275
- Robust benchmarking in noisy environments - https://arxiv.org/abs/1608.04295
- nanoBench: A Low-Overhead Tool for Running Microbenchmarks on x86 Systems - https://arxiv.org/abs/1911.03282
- The Linux Scheduler: a Decade of Wasted Cores - https://people.ece.ubc.ca/sasha/papers/eurosys16-final29.pdf
- The Tail At Scale - https://www.barroso.org/publications/TheTailAtScale.pdf
- Can Seqlocks Get Along With Programming Language Memory Models - https://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
- Cache-Oblivious Algorithms and Data Structures - https://erikdemaine.org/papers/BRICS2002
- Memory-Centric Computing: Solving Computing’s Memory Problem - https://arxiv.org/abs/2505.00458
- High-Precision Branch Target Injection Attacks Exploiting the Indirect Branch Predictor - https://indirector.cpusec.org
- A CPU research kernel with minimal noise for cycle-by-cycle micro-architectural introspection - https://gamozolabs.github.io/metrology/2019/08/19/sushi_roll.html
Multimedia
News
- Linux News - https://lwn.net
- Chips and Cheese - https://chipsandcheese.com
- WikiChip - https://wikichip.org
- CPUID - https://www.cpuid.com/news.html
- Real World Tech - https://www.realworldtech.com
- Tom's Hardware - https://www.tomshardware.com
- Phoronix - https://www.phoronix.com
comp.lang.asm.x86
- https://groups.google.com/g/comp.lang.asm.x86
Blogs
- Agner Fog - https://www.agner.org
- Denis Bakhvalov - https://easyperf.net/blog
- Daniel Lemire - https://lemire.me/blog
- Wojciech Mula - http://0x80.pl/articles/index.html
- Erik Rigtorp - https://rigtorp.se
- Brendan Gregg - https://brendangregg.com/blog
- Geoff Langdale - https://branchfree.org
- Ragnar Groot Koerkamp - https://curiouscoding.nl/posts/cpu-benchmarks
- Travis Downs - https://travisdowns.github.io
- Stefanos Baziotis - https://sbaziotis.com/#blog
- Dmitry Vyukov - https://www.1024cores.net
- John Farrier - https://johnfarrier.com
- Tanel Poder - https://tanelpoder.com
- Arseny Kapoulkine - https://zeux.io
- Johnny Software Blog - https://johnnysswlab.com
- JabPerf - https://jabperf.com/blog
- Number World - http://www.numberworld.org
- Gamozo Labs - https://gamozolabs.github.io
- Mechanical Sympathy - https://mechanical-sympathy.blogspot.com
- Performance Engineering - https://pramodkumbhar.com
- Performance Matters - https://thume.ca/archive.html
- Performance Tricks - https://www.performetriks.com/blog
- Coding Confessions - https://blog.codingconfessions.com
- The Netflix Tech - https://netflixtechblog.com
- Cloudflare - https://blog.cloudflare.com
Lists
- C++ Links - https://github.com/MattPD/cpplinks
- Awesome Performance C++ - https://github.com/fenbf/AwesomePerfCpp
- Awesome Lock Free - https://github.com/rigtorp/awesome-lockfree
- Awesome SIMD - https://github.com/awesome-simd/awesome-simd
- Computer, Enhance! - https://www.computerenhance.com
- Low Latency Trading Insights - https://lucisqr.substack.com
Channels
- Computer Architecture - Onur Mutlu - https://www.youtube.com/@OnurMutluLectures
- Computer, Enhance - Casey Muratori - https://www.youtube.com/@MollyRocket
- Assembly - Creel - https://www.youtube.com/c/WhatsACreel
- EasyPerf - Denis Bakhvalov - https://www.youtube.com/@easyperf3992
- SIMD algorithms - Denis Yaroshevskiy - https://www.youtube.com/playlist?list=PLYCMvilhmuPEM8DUvY6Wg_jaSFHpmlSBD
Videos
- Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My! - Chandler Carruth - https://www.youtube.com/watch?v=nXaxk27zwlk
- Counting Nanoseconds Microbenchmarking C++ Code - David Gross - https://www.youtube.com/watch?v=Czr5dBfs72U
- Benchmarking C++ Code - Bryce Adelstein-Lelbach - https://www.youtube.com/watch?v=zWxSZcpeS8Q
- Benchmarking C++, From video games to algorithmic trading - Alexander Radchenko - https://www.youtube.com/watch?v=7YVMC5v4qCA
- Tuning C++: Benchmarks, and CPUs, and Compilers! Oh My! - Chandler Carruth - https://www.youtube.com/watch?v=nXaxk27zwlk
- Going Nowhere Faster - Chandler Carruth - https://www.youtube.com/watch?v=2EWejmkKlxs
- Measurement and Timing - Performance Engineering of Software Systems - https://www.youtube.com/watch?v=LvX3g45ynu8
- How NOT to Measure Latency - Gil Tene - https://www.youtube.com/watch?v=lJ8ydIuPFeU
- From Top-down Microarchitecture Analysis to Structured Performance Optimizations - https://cassyni.com/events/YKbqoE4axHCgvQ9vuQq7Cy
- Coz: finding code that counts with causal profiling - ACM - https://www.youtube.com/watch?v=jE0V-p1odPg
- Take Advantage for Intel Instrumentation and Tracing Technology for Performance Analysis - https://www.youtube.com/watch?v=1zdVFLajewM&list=PLg-UKERBljNw3_6Q598CS3DE7KqDXjP-d
- LIKWID Performance Tools - https://www.youtube.com/playlist?list=PLxVedhmuwLq2CqJpAABDMbZG8Whi7pKsk
- Introduction to the Tracy Profiler - Bartosz Taudul - https://youtu.be/fB5B46lbapc
- Performance Matters - Emery Berger - https://www.youtube.com/watch?v=r-TLSBdHe1A
- Understanding the Performance of code using LLVM-MCA - A. Biagio & M. Davis - https://www.youtube.com/watch?v=Ku2D8bjEGXk
- LLVM Optimization Remarks - Ofek Shilon - https://www.youtube.com/watch?v=qmEsx4MbKoc
- Understanding Compiler Optimization - Chandler Carruth - https://www.youtube.com/watch?v=haQ2cijhvhE
- Efficiency with Algorithms, Performance with Data Structures - Chandler Carruth - https://www.youtube.com/watch?v=fHNmRkzxHWs
- Design for Performance - Fedor Pikus - https://www.youtube.com/watch?v=m25p3EtBua4
- Unlocking Modern CPU Power - Next-Gen C++ Optimization Techniques - https://www.youtube.com/watch?v=wGSSUSeaLgA
- Branchless Programming in C++ - Fedor Pikus - https://www.youtube.com/watch?v=g-WPhYREFjk
- CPU design effects - Jakub Beranek - https://www.youtube.com/watch?v=ICKIMHCw--Y
- Fastware - Andrei Alexandrescu - https://www.youtube.com/watch?v=o4-CwDo2zpg
- Performance Tuning - Matt Godbolt - https://www.youtube.com/watch?v=fV6qYho-XVs
- Memory & Caches - Matt Godbolt - https://www.youtube.com/watch?v=4_smHyqgDTU
- What Every Programmer Should Know about How CPUs Work - Matt Godbolt - https://www.youtube.com/watch?v=-HNpim5x-IE
- There Are No Zero-cost Abstractions - Chandler Carruth - https://www.youtube.com/watch?v=rHIkrotSwcc
- Understanding Optimizers: Helping the Compiler Help You - Nir Friedman - https://www.youtube.com/watch?v=8nyq8SNUTSc
- C++ Algorithmic Complexity, Data Locality, Parallelism, Compiler Optimizations, & Some Concurrency - Avi Lachmish - https://www.youtube.com/watch?v=0iXRRCnurvo
- Software Optimizations Become Simple with Top-Down Analysis on Intel Skylake - Ahmad Yasin - https://www.youtube.com/watch?v=kjufVhyuV_A
- Being Friendly to Your Computer Hardware in Software Development - Ignas Bagdonas - https://www.youtube.com/watch?v=eceFgsiPPmk
- Want fast C++? Know your hardware - Timur Doumler - https://www.youtube.com/watch?v=BP6NxVxDQIs
- What is Low Latency C++ - Timur Doumler - https://www.youtube.com/watch?v=EzmNeAhWqVs, https://www.youtube.com/watch?v=5uIsadq-nyk
- Where Have All the Cycles Gone? - Sean Parent - https://www.youtube.com/watch?v=B-aDBB34o6Y
- Understanding CPU Microarchitecture to Increase Performance - https://www.youtube.com/watch?v=rglmJ6Xyj1c
- Performance Analysis & Tuning on Modern CPU - Denis Bakhvalov - https://www.youtube.com/watch?v=Ho3bCIJcMcc
- Comparison of C++ Performance Optimization Techniques for C++ Programmers - Eduardo Madrid - https://www.youtube.com/watch?v=4DQqcRwFXOI
- Simple Code, High Performance - Molly Rocket - https://www.youtube.com/watch?v=Ge3aKEmZcqY
- Assembly, System Calls, and Hardware in C++ - David Sankel - https://www.youtube.com/watch?v=7xwjjolDnwg
- Optimizing Binary Search - Sergey Slotin - https://www.youtube.com/watch?v=1RIPMQQRBWk
- A Deep Dive Into Dispatching Techniques in C++ - Jonathan Muller - https://www.youtube.com/watch?v=vUwsfmVkKtY
- Dive into the general purpose GPU programming - Ashot Vardanian - https://www.youtube.com/watch?v=AA4RI6o0h1U
- C++ Memory Model: from C++11 to C++23 - Alex Dathskovsky - https://www.youtube.com/watch?v=SVEYNEWZLo4
- Abusing Your Memory Model for Fun and Profit - Samy Al Bahra, Paul Khuong - https://www.youtube.com/watch?v=N07tM7xWF1U&t=1s
- The speed of concurrency (is lock-free faster?) - Fedor Pikus - https://www.youtube.com/watch?v=9hJkWwHDDxs
- Read, Copy, Update, then what? RCU for non-kernel programmers - Fedor Pikus - https://www.youtube.com/watch?v=rxQ5K9lo034
- Single Producer Single Consumer Lock-free FIFO From the Ground Up - Charles Frasch - https://www.youtube.com/watch?v=K3P_Lmq6pw0
- Introduction to Hardware Efficiency in Cpp - Ivica Bogosavljevic - https://www.youtube.com/watch?v=Fs_T070H9C8
- The Performance Price of Dynamic Memory in C++ - Ivica Bogosavljevic - https://www.youtube.com/watch?v=LC4jOs6z-ZI
- Kernel Bypass HFT Optimization - https://www.youtube.com/watch?v=FFI9IAy5ZaE
- The Hidden Performance Price of C++ Virtual Functions - Ivica Bogosavljevic - https://www.youtube.com/watch?v=n6PvvE_tEPk
- Why do Programs Get Slower with Time? - Ivica Bogosavljevic - https://www.youtube.com/watch?v=nS5vjnPKX0I
- CPU Cache Effects - Sergey Slotin - https://www.youtube.com/watch?v=mQWuX_KgH00
- Cpu Caches and Why You Care - Scott Meyers - https://www.youtube.com/watch?v=WDIkqP4JbkE
- CPU vs FPGA - https://www.youtube.com/watch?v=BML1YHZpx2o
- Designing for Efficient Cache Usage - Scott McMillan - https://www.youtube.com/watch?v=3-ityWN-FdE
- Cache consistency and the C++ memory model - Yossi Moale - https://www.youtube.com/watch?v=Sa08x_NMZIg
std::simd
: How to Express Inherent Parallelism Efficiently Via Data-parallel Types - Matthias Kretz - https://www.youtube.com/watch?v=LAJ_hywLtMA
- The Art of SIMD Programming - Sergey Slotin - https://www.youtube.com/watch?v=vIRjSdTCIEU
- Advanced SIMD Algorithms in Pictures - Denis Yaroshevskiy - https://www.youtube.com/watch?v=vGcH40rkLdA
- Performance Optimization, SIMD and Cache - Sergiy Migdalskiy - https://www.youtube.com/watch?v=Nsf2_Au6KxU
- Data-Oriented Design and C++ - Mike Acton - https://www.youtube.com/watch?v=rX0ItVEVjHc
- Practical Data Oriented Design (DoD) - Andrew Kelley - https://www.youtube.com/watch?v=IroPQ150F6c
- Data Orientation For The Win - Eduardo Madrid - https://www.youtube.com/watch?v=QbffGSgsCcQ
- You Can Do Better than std::unordered_map - Malte Skarupke - https://www.youtube.com/watch?v=M2fKMP47slQ
- Faster than Rust and C++: the PERFECT hash table - https://www.youtube.com/watch?v=DMQ_HcNSOAI
- Designing a Fast, Efficient, Cache-friendly Hash Table, Step by Step - Matt Kulukundis - https://www.youtube.com/watch?v=ncHmEUmJZf4
- C++ Run-Time Optimizations for Compile-Time Reflection - Kris Jusiak - https://www.youtube.com/watch?v=ncHmEUmJZf4 - https://www.youtube.com/watch?v=kCATOctR0BA
- When Nanoseconds Matter: Ultrafast Trading Systems in C++ - David Gross - https://www.youtube.com/watch?v=sX2nF1fW7kI
- When a Microsecond Is an Eternity: High Performance Trading Systems in C++ - Carl Cook - https://www.youtube.com/watch?v=NH1Tta7purM
- The Speed Game: Automated Trading Systems in C++ - Carl Cook - https://www.youtube.com/watch?v=ulOLGX3HNCI
- Low-Latency Trading Systems in C++ - Jason McGuiness - https://www.youtube.com/watch?v=FnMfhWiSweo
- High Frequency Trading and Ultra Low Latency development techniques - Nimrod Sapir - https://www.youtube.com/watch?v=_0aU8S-hFQI
- Trading at light speed: designing low latency systems in C++ - David Gross - https://www.youtube.com/watch?v=8uAW5FQtcvE&list=PLSkBiuVO9yj1MvDkYJ5WOnPeKsoRi3eiW&index=2
- Optimizing Trading Strategies for FPGAs in C/C++ - https://www.youtube.com/watch?v=4Wklh0XS5i0
- C++ Electronic Trading for Cpp Programmers - Mathias Gaunard - https://www.youtube.com/watch?v=ltT2fDqBCEo
- Achieving performance in financial data processing through compile time introspection - Eduardo Madrid - https://www.youtube.com/watch?v=z6fo90R8q5U
- How to Simulate a Low Latency Exchange in C++ - Benjamin Catterall - https://www.youtube.com/watch?v=QQrTE4YLkSE
- Building Low Latency Trading Systems - https://www.youtube.com/watch?v=yBNpSqOOoRk
- Cache Warming: Warm Up The Code - Jonathan Keinan - https://www.youtube.com/watch?v=XzRxikGgaHI
- How Linux Took Over the World of Finance - Christoph H Lameter - https://www.youtube.com/watch?v=UUOM4KdaHkY
Miscellaneous
- Conferences - https://www.p99conf.io, https://supercomputing.org, https://hotchips.org, https://microarch.org
- Podcasts - https://signals-threads.simplecast.com, https://microarch.club, https://tlbh.it, https://twoscomplement.org
- C++ Low Latency Group (SG14) - https://github.com/WG21-SG14/SG14
Workbench
Speed of light ............................ ~1 foot/ns
L1 cache reference ......................... 0.5 ns
Branch mispredict ............................ 5 ns
L2 cache reference ........................... 7 ns
Mutex lock/unlock ........................... 25 ns
Main memory reference ...................... 100 ns
Send 2K bytes over 1 Gbps network ....... 20,000 ns = 20 µs
SSD random read ........................ 150,000 ns = 150 µs
Read 1 MB sequentially from memory ..... 250,000 ns = 250 µs
Round trip within same datacenter ...... 500,000 ns = 0.5 ms
Read 1 MB sequentially from SSD ..... 1,000,000 ns = 1 ms
Read 1 MB sequentially from disk .... 20,000,000 ns = 20 ms
Send packet CA->UK->CA .... 150,000,000 ns = 150 ms
Metrics
- Latency, Throughput, and Port Usage Information - https://uops.info
- Latency, Memory Latency and CPUID dumps - http://instlatx64.atw.hu
- Memory Latency Data - https://chipsandcheese.com/memory-latency-data
- Core To Core Latency - https://github.com/nviennot/core-to-core-latency
- Operation Costs in CPU Clock Cycles - http://ithare.com/infographics-operation-costs-in-cpu-clock-cycles
- Microarchitecture Metrics - https://dougallj.github.io/applecpu/firestorm.html
- Top-Down Metrics - https://github.com/intel/perfmon/blob/main/TMA_Metrics-full.xlsx
Tutorials
- Performance Ninja Class - https://github.com/dendibakh/perf-ninja
- Hardware Effects - https://github.com/Kobzol/hardware-effects
- Performance Tuning - https://github.com/NAThompson/performance_tuning_tutorial
- Mastering C++ with Google Benchmark - https://ashvardanian.com/posts/google-benchmark
- Learning to Write Less Slow C, C++, and Assembly Code - https://github.com/ashvardanian/less_slow.cpp
- A Top-Down method for performance analysis and counters architecture - https://www.researchgate.net/publication/269302126_A_Top-Down_method_for_performance_analysis_and_counters_architecture
- Measuring Workloads With TopLev - https://github.com/andikleen/pmu-tools/wiki/toplev-manual
- Bits Of Architecture - https://github.com/CoffeeBeforeArch/bits_of_architecture
- Bit Twiddling Hacks - https://graphics.stanford.edu/~seander/bithacks.html
- The Linux Kernel - https://www.kernel.org/doc/html/latest/index.html
- gcc Optimization - https://wiki.gentoo.org/wiki/GCC_optimization
- gcc Assembler Syntax - https://www.felixcloutier.com/documents/gcc-asm
- llvm Scheduling Models - https://github.com/llvm/llvm-project/tree/main/llvm/lib/Target
- llvm Optimization Passes - https://llvm.org/docs/Passes.html
- llvm Vectorizers - https://llvm.org/docs/Vectorizers.html
- linux Referencer - https://elixir.bootlin.com/linux
Benchmarks
- Phoronix Test Suite - https://www.phoronix-test-suite.com
- CPU benchmark - https://www.cpubenchmark.net
- Geekbench benchmark - https://browser.geekbench.com
- CPU Benchmarks - https://curiouscoding.nl/posts/cpu-benchmarks
- 7-Zip LZMA Benchmark - https://www.7-cpu.com
Tooling
Frameworks
- google-benchmark - https://github.com/google/benchmark / https://quick-bench.com
- nanobench - https://github.com/martinus/nanobench
- celero - https://github.com/DigitalInBlue/Celero
- nanoBench - https://github.com/andreas-abel/nanoBench
- uarch-bench - https://github.com/travisdowns/uarch-bench
- llvm-exegesis - https://llvm.org/docs/CommandGuide/llvm-exegesis.html
Profilers
- linux-perf - https://perf.wiki.kernel.org
- intel-vtune - https://www.intel.com/content/www/us/en/docs/vtune-profiler
- amd-uprof - https://www.amd.com/en/developer/uprof.html
- pmu-tools - https://github.com/andikleen/pmu-tools
- perf-tools - https://github.com/brendangregg/perf-tools
- magictrace - https://github.com/janestreet/magic-trace
- tracy - https://github.com/wolfpld/tracy
- likwid - https://github.com/RRZE-HPC/likwid
- coz - https://github.com/plasma-umass/coz
- ebpf - https://ebpf.io
- callgrind - https://valgrind.org/docs/manual/cl-manual.html
- yperf - https://github.com/aayasin/perf-tools
- dtrace - https://www.oracle.com/linux/downloads/linux-dtrace.html
- ftrace - https://www.kernel.org/doc/html/latest/trace/ftrace.html
- utrace - https://github.com/Gui774ume/utrace
- strace - https://strace.io
- omnitrace - https://github.com/ROCm/omnitrace
- optick - https://github.com/bombomby/optick
- easy_profiler - https://github.com/yse/easy_profiler
- gprof - https://ftp.gnu.org/old-gnu/Manuals/gprof-2.9.1/html_mono/gprof.html
- gperftools - https://github.com/gperftools/gperftools
- oprofile - https://oprofile.sourceforge.io
- llvm-opt-report - https://llvm.org/docs/CommandGuide/llvm-opt-report.html
- optview2 - https://github.com/OfekShilon/optview2
- llvm-xray - https://llvm.org/docs/XRay.html
- lttng - https://lttng.org
- bcc - https://github.com/iovisor/bcc
- sysprof - https://www.sysprof.com
Analyzers
- compiler-explorer - https://compiler-explorer.com
- llvm-mca - https://llvm.org/docs/CommandGuide/llvm-mca.html
- osaca - https://github.com/RRZE-HPC/OSACA
- uica - https://uica.uops.info
Optimizers
- clang-pgo - https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization
- gcc-pgo - https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
- llvm-bolt - https://github.com/llvm/llvm-project/blob/main/bolt/README.md
- llvm-propeller - https://github.com/google/llvm-propeller
- autofdo - https://github.com/google/autofdo
- e-graphs - https://egraphs-good.github.io
Utilities
- pyperf - https://github.com/psf/pyperf
- hyperfine - https://github.com/sharkdp/hyperfine
- numatop - https://github.com/intel/numatop
- bpftop - https://github.com/Netflix/bpftop
- pahole - https://github.com/acmel/dwarves
- perfetto - https://perfetto.dev
- speedscope - https://github.com/jlfwong/speedscope
- kcachegrind - https://kcachegrind.sourceforge.net/html/Home.html
- hotspot - https://github.com/KDAB/hotspot
- jupyter - https://jupyter.org
Libraries
- perf-event-open - https://man7.org/linux/man-pages/man2/perf_event_open.2.html
- llvm-dev - https://llvm.org
- perfmon2 - https://perfmon2.sourceforge.net
- papi - https://github.com/icl-utk-edu/papi
- libpfc - https://github.com/obilaniu/libpfc
- intel-pt - https://github.com/intel/libipt
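MIT/Apache2+LLVM
| license      | namespace    | guard          | description                        |
| ------------ | ------------ | -------------- | ---------------------------------- |
| MIT          | perf::*      | -              | https://opensource.org/license/mit |
| Apache2+LLVM | perf::mca::* | PERF_LLVM == 1 | https://llvm.org/LICENSE.txt       |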