From 4fe18438e2d0c2a17448cefa1a58d9758b89dabb Mon Sep 17 00:00:00 2001 From: Bram Veenboer Date: Fri, 19 Apr 2024 13:06:01 +0200 Subject: [PATCH 01/10] Add NCUObserver --- kernel_tuner/core.py | 12 ++++++++-- kernel_tuner/observers/__init__.py | 2 +- kernel_tuner/observers/ncu.py | 36 ++++++++++++++++++++++++++++++ kernel_tuner/observers/observer.py | 11 +++++++++ 4 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 kernel_tuner/observers/ncu.py diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 174cd3af5..1671f2cdc 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -23,7 +23,7 @@ from kernel_tuner.backends.opencl import OpenCLFunctions from kernel_tuner.backends.hip import HipFunctions from kernel_tuner.observers.nvml import NVMLObserver -from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver +from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver try: import torch @@ -319,6 +319,7 @@ def __init__( self.use_nvml = False self.continuous_observers = [] self.output_observers = [] + self.prologue_observers = [] if observers: for obs in observers: if isinstance(obs, NVMLObserver): @@ -328,7 +329,8 @@ def __init__( self.continuous_observers.append(obs.continuous_observer) if isinstance(obs, OutputObserver): self.output_observers.append(obs) - + if isinstance(obs, PrologueObserver): + self.prologue_observers.append(obs) self.iterations = iterations @@ -346,6 +348,12 @@ def benchmark_default(self, func, gpu_args, threads, grid, result): obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver) ] + for obs in self.prologue_observers: + obs.before_start() + self.dev.run_kernel(func, gpu_args, threads, grid) + self.dev.synchronize() + obs.after_finish() + self.dev.synchronize() for _ in range(self.iterations): for obs in observers: diff --git a/kernel_tuner/observers/__init__.py b/kernel_tuner/observers/__init__.py index ad27791d5..3036a8483 100644 --- a/kernel_tuner/observers/__init__.py +++ b/kernel_tuner/observers/__init__.py @@ -1 +1 @@ -from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver +from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver, PrologueObserver diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py new file mode 100644 index 000000000..a97fd941f --- /dev/null +++ b/kernel_tuner/observers/ncu.py @@ -0,0 +1,36 @@ +from kernel_tuner.observers import PrologueObserver + +try: + import nvmetrics +except (ImportError): + nvmetrics = None + pass + +class NCUObserver(PrologueObserver): + """``NCUObserver`` measures performance counters. + + """ + + def __init__(self, metrics=None): + """Create a new ``NCUObserver``. + + :param metrics: The metrics to observe. This should be a list of strings. + You can use ``ncu --query-metrics`` to get a list of valid metrics. + """ + + if not nvmetrics: + print("NCUObserver is not available.") + + self.metrics = metrics + self.results = dict() + + def before_start(self): + if nvmetrics: + nvmetrics.measureMetricsStart(self.metrics) + + def after_finish(self): + if nvmetrics: + self.results = nvmetrics.measureMetricsStop() + + def get_results(self): + return dict(zip(self.metrics, self.results)) diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index 493de94f8..21bdb232e 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -57,4 +57,15 @@ def process_output(self, answer, output): """ pass +class PrologueObserver(BenchmarkObserver): + """Observer that measures something in a seperate kernel invocation prior to the normal benchmark.""" + @abstractmethod + def before_start(self): + """before start is called before the kernel starts""" + pass + + @abstractmethod + def after_finish(self): + """after finish is called after the kernel has finished execution""" + pass From 2183d136b7277718e39d5c770ef7a2d621451934 Mon Sep 17 00:00:00 2001 From: Ben van Werkhoven Date: Mon, 22 Apr 2024 14:01:48 +0200 Subject: [PATCH 02/10] add device option --- kernel_tuner/observers/ncu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py index a97fd941f..2ef60ecb8 100644 --- a/kernel_tuner/observers/ncu.py +++ b/kernel_tuner/observers/ncu.py @@ -11,7 +11,7 @@ class NCUObserver(PrologueObserver): """ - def __init__(self, metrics=None): + def __init__(self, metrics=None, device=0): """Create a new ``NCUObserver``. :param metrics: The metrics to observe. This should be a list of strings. @@ -22,11 +22,12 @@ def __init__(self, metrics=None): print("NCUObserver is not available.") self.metrics = metrics + self.device = device self.results = dict() def before_start(self): if nvmetrics: - nvmetrics.measureMetricsStart(self.metrics) + nvmetrics.measureMetricsStart(self.metrics, self.device) def after_finish(self): if nvmetrics: From 225dae7902d4d1c54c34023406a5e79d5aca31c6 Mon Sep 17 00:00:00 2001 From: Ben van Werkhoven Date: Mon, 22 Apr 2024 14:22:34 +0200 Subject: [PATCH 03/10] add example for NCUObserver --- examples/cuda/vector_add_ncuobserver.py | 57 +++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 examples/cuda/vector_add_ncuobserver.py diff --git a/examples/cuda/vector_add_ncuobserver.py b/examples/cuda/vector_add_ncuobserver.py new file mode 100644 index 000000000..589420a3f --- /dev/null +++ b/examples/cuda/vector_add_ncuobserver.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +"""This is the minimal example from the README""" +import json + +import numpy +from kernel_tuner import tune_kernel +from kernel_tuner.observers.ncu import NCUObserver + +def tune(): + + kernel_string = """ + __global__ void vector_add(float *c, float *a, float *b, int n) { + int i = blockIdx.x * block_size_x + threadIdx.x; + if (i Date: Mon, 22 Apr 2024 14:40:00 +0200 Subject: [PATCH 04/10] expand documentation --- doc/source/observers.rst | 8 ++++++++ kernel_tuner/observers/ncu.py | 16 +++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/doc/source/observers.rst b/doc/source/observers.rst index 174e6a01a..df4013734 100644 --- a/doc/source/observers.rst +++ b/doc/source/observers.rst @@ -112,3 +112,11 @@ More information about PMT can be found here: https://git.astron.nl/RD/pmt/ +NCUObserver +~~~~~~~~~~~ + +The NCUObserver can be used to automatically extract performance counters during tuning using Nvidia's NsightCompute profiler. +The NCUObserver relies on an intermediate library, which can be found here: https://github.com/nlesc-recruit/nvmetrics + +.. autoclass:: kernel_tuner.observers.ncu.NCUObserver + diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py index 2ef60ecb8..3a808588f 100644 --- a/kernel_tuner/observers/ncu.py +++ b/kernel_tuner/observers/ncu.py @@ -4,20 +4,26 @@ import nvmetrics except (ImportError): nvmetrics = None - pass class NCUObserver(PrologueObserver): """``NCUObserver`` measures performance counters. - """ + The exact performance counters supported differ per GPU, some examples: - def __init__(self, metrics=None, device=0): - """Create a new ``NCUObserver``. + * "dram__bytes.sum", # Counter byte # of bytes accessed in DRAM + * "dram__bytes_read.sum", # Counter byte # of bytes read from DRAM + * "dram__bytes_write.sum", # Counter byte # of bytes written to DRAM + * "smsp__sass_thread_inst_executed_op_fadd_pred_on.sum", # Counter inst # of FADD thread instructions executed where all predicates were true + * "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum", # Counter inst # of FFMA thread instructions executed where all predicates were true + * "smsp__sass_thread_inst_executed_op_fmul_pred_on.sum", # Counter inst # of FMUL thread instructions executed where all predicates were true :param metrics: The metrics to observe. This should be a list of strings. You can use ``ncu --query-metrics`` to get a list of valid metrics. - """ + :type metrics: list[str] + """ + + def __init__(self, metrics=None, device=0): if not nvmetrics: print("NCUObserver is not available.") From 3602286ce8a19ea5ecdbe35e7738ba52de1a3e7b Mon Sep 17 00:00:00 2001 From: Bram Veenboer Date: Tue, 23 Apr 2024 11:29:16 +0200 Subject: [PATCH 05/10] Raise exception when using NCUObserver without nvmetrics --- kernel_tuner/observers/ncu.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py index 3a808588f..6a4cc4b4e 100644 --- a/kernel_tuner/observers/ncu.py +++ b/kernel_tuner/observers/ncu.py @@ -25,19 +25,17 @@ class NCUObserver(PrologueObserver): def __init__(self, metrics=None, device=0): if not nvmetrics: - print("NCUObserver is not available.") + raise Exception("NCUObserver is not available.") self.metrics = metrics self.device = device self.results = dict() def before_start(self): - if nvmetrics: - nvmetrics.measureMetricsStart(self.metrics, self.device) + nvmetrics.measureMetricsStart(self.metrics, self.device) def after_finish(self): - if nvmetrics: - self.results = nvmetrics.measureMetricsStop() + self.results = nvmetrics.measureMetricsStop() def get_results(self): return dict(zip(self.metrics, self.results)) From 397306f22da7f892fb4a36c101db5bb0243efc06 Mon Sep 17 00:00:00 2001 From: Bram Veenboer Date: Tue, 23 Apr 2024 11:30:08 +0200 Subject: [PATCH 06/10] Rename example to vector_add_observers_ncu --- .../{vector_add_ncuobserver.py => vector_add_observers_ncu.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/cuda/{vector_add_ncuobserver.py => vector_add_observers_ncu.py} (100%) diff --git a/examples/cuda/vector_add_ncuobserver.py b/examples/cuda/vector_add_observers_ncu.py similarity index 100% rename from examples/cuda/vector_add_ncuobserver.py rename to examples/cuda/vector_add_observers_ncu.py From 3402808618be7b9cd59019da54a54f0166d182b0 Mon Sep 17 00:00:00 2001 From: Bram Veenboer Date: Wed, 24 Apr 2024 08:22:54 +0200 Subject: [PATCH 07/10] Rename to prologue_start and prologue_finish --- kernel_tuner/core.py | 4 ++-- kernel_tuner/observers/ncu.py | 4 ++-- kernel_tuner/observers/observer.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 1671f2cdc..0eba8e85d 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -349,10 +349,10 @@ def benchmark_default(self, func, gpu_args, threads, grid, result): ] for obs in self.prologue_observers: - obs.before_start() + obs.prologue_start() self.dev.run_kernel(func, gpu_args, threads, grid) self.dev.synchronize() - obs.after_finish() + obs.prologue_finish() self.dev.synchronize() for _ in range(self.iterations): diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py index 6a4cc4b4e..a58f54daf 100644 --- a/kernel_tuner/observers/ncu.py +++ b/kernel_tuner/observers/ncu.py @@ -31,10 +31,10 @@ def __init__(self, metrics=None, device=0): self.device = device self.results = dict() - def before_start(self): + def prologue_start(self): nvmetrics.measureMetricsStart(self.metrics, self.device) - def after_finish(self): + def prologue_finish(self): self.results = nvmetrics.measureMetricsStop() def get_results(self): diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index 21bdb232e..99f8de037 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -61,11 +61,11 @@ class PrologueObserver(BenchmarkObserver): """Observer that measures something in a seperate kernel invocation prior to the normal benchmark.""" @abstractmethod - def before_start(self): - """before start is called before the kernel starts""" + def prologue_start(self): + """prologue start is called before the kernel starts""" pass @abstractmethod - def after_finish(self): - """after finish is called after the kernel has finished execution""" + def prologue_finish(self): + """prologue finish is called after the kernel has finished execution""" pass From 35910f55c12123b76d13399b320f7cc4333e2fbb Mon Sep 17 00:00:00 2001 From: Ben van Werkhoven Date: Wed, 24 Apr 2024 10:41:22 +0200 Subject: [PATCH 08/10] suggestion to organize prologue observers differently --- kernel_tuner/core.py | 39 +++++++++++++++++------------- kernel_tuner/observers/ncu.py | 4 +-- kernel_tuner/observers/observer.py | 4 +-- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 0eba8e85d..b453400b5 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -314,6 +314,7 @@ def __init__( ) else: raise ValueError("Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet") + self.dev = dev # look for NVMLObserver in observers, if present, enable special tunable parameters through nvml self.use_nvml = False @@ -332,53 +333,58 @@ def __init__( if isinstance(obs, PrologueObserver): self.prologue_observers.append(obs) + # Take list of observers from self.dev because Backends tend to add their own observer + self.benchmark_observers = [ + obs for obs in self.dev.observers if not isinstance(obs, (ContinuousObserver, PrologueObserver)) + ] + self.iterations = iterations self.lang = lang - self.dev = dev self.units = dev.units self.name = dev.name self.max_threads = dev.max_threads if not quiet: print("Using: " + self.dev.name) - def benchmark_default(self, func, gpu_args, threads, grid, result): - """Benchmark one kernel execution at a time""" - observers = [ - obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver) - ] + def benchmark_prologue(self, func, gpu_args, threads, grid, result): + """Benchmark prologue one kernel execution per PrologueObserver""" for obs in self.prologue_observers: - obs.prologue_start() + self.dev.synchronize() + obs.before_start() self.dev.run_kernel(func, gpu_args, threads, grid) self.dev.synchronize() - obs.prologue_finish() + obs.after_finish() + result.update(obs.get_results()) + + def benchmark_default(self, func, gpu_args, threads, grid, result): + """Benchmark one kernel execution for 'iterations' at a time""" self.dev.synchronize() for _ in range(self.iterations): - for obs in observers: + for obs in self.benchmark_observers: obs.before_start() self.dev.synchronize() self.dev.start_event() self.dev.run_kernel(func, gpu_args, threads, grid) self.dev.stop_event() - for obs in observers: + for obs in self.benchmark_observers: obs.after_start() while not self.dev.kernel_finished(): - for obs in observers: + for obs in self.benchmark_observers: obs.during() time.sleep(1e-6) # one microsecond self.dev.synchronize() - for obs in observers: + for obs in self.benchmark_observers: obs.after_finish() - for obs in observers: + for obs in self.benchmark_observers: result.update(obs.get_results()) def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration): """Benchmark continuously for at least 'duration' seconds""" iterations = int(np.ceil(duration / (result["time"] / 1000))) - # print(f"{iterations=} {(result['time']/1000)=}") self.dev.synchronize() for obs in self.continuous_observers: obs.before_start() @@ -423,9 +429,8 @@ def benchmark(self, func, gpu_args, instance, verbose, objective): result = {} try: - self.benchmark_default( - func, gpu_args, instance.threads, instance.grid, result - ) + self.benchmark_prologue(func, gpu_args, instance.threads, instance.grid, result) + self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result) if self.continuous_observers: duration = 1 diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py index a58f54daf..6a4cc4b4e 100644 --- a/kernel_tuner/observers/ncu.py +++ b/kernel_tuner/observers/ncu.py @@ -31,10 +31,10 @@ def __init__(self, metrics=None, device=0): self.device = device self.results = dict() - def prologue_start(self): + def before_start(self): nvmetrics.measureMetricsStart(self.metrics, self.device) - def prologue_finish(self): + def after_finish(self): self.results = nvmetrics.measureMetricsStop() def get_results(self): diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index 99f8de037..ec13bed69 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -61,11 +61,11 @@ class PrologueObserver(BenchmarkObserver): """Observer that measures something in a seperate kernel invocation prior to the normal benchmark.""" @abstractmethod - def prologue_start(self): + def before_start(self): """prologue start is called before the kernel starts""" pass @abstractmethod - def prologue_finish(self): + def after_finish(self): """prologue finish is called after the kernel has finished execution""" pass From 7bb58c3f43dd355e61a7d7e67d31c24fb9a8ce7e Mon Sep 17 00:00:00 2001 From: Floris-Jan Willemsen Date: Wed, 24 Apr 2024 14:01:48 +0200 Subject: [PATCH 09/10] Update test-python-package.yml --- .github/workflows/test-python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml index e4ed81c8a..c0541298b 100644 --- a/.github/workflows/test-python-package.yml +++ b/.github/workflows/test-python-package.yml @@ -28,7 +28,7 @@ jobs: - name: Setup Nox uses: fjwillemsen/setup-nox2@v3.0.0 - name: Setup Poetry - uses: Gr1N/setup-poetry@v8 + uses: Gr1N/setup-poetry@v9 - run: poetry self add poetry-plugin-export - name: Run tests with Nox run: | From 742faa4210157e709e303916b20ac7e3a9c4da6e Mon Sep 17 00:00:00 2001 From: Bram Veenboer Date: Fri, 26 Apr 2024 08:53:55 +0200 Subject: [PATCH 10/10] Raise error like in PMTObserver --- kernel_tuner/observers/ncu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/observers/ncu.py b/kernel_tuner/observers/ncu.py index 6a4cc4b4e..c727e1e30 100644 --- a/kernel_tuner/observers/ncu.py +++ b/kernel_tuner/observers/ncu.py @@ -25,7 +25,7 @@ class NCUObserver(PrologueObserver): def __init__(self, metrics=None, device=0): if not nvmetrics: - raise Exception("NCUObserver is not available.") + raise ImportError("could not import nvmetrics") self.metrics = metrics self.device = device