@@ -340,14 +340,62 @@ def __init__(
         if not quiet:
             print("Using: " + self.dev.name)
 
-    def benchmark_default(self, func, gpu_args, threads, grid, result):
-        """Benchmark one kernel execution at a time"""
+        if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']:
+            # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130
+            flush_gpu_string = """
+            __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
+                const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                const char val = d_flush[idx];
+                if (do_write * val) {
+                    d_flush2[idx] = val;
+                }
+            }
+            """
+            cache_size = self.dev.cache_size_L2
+            d_flush = np.ones(cache_size, order='F').astype(np.float32)
+            d_flush2 = np.ones(cache_size, order='F').astype(np.float32)
+            self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)]
+
+            from kernel_tuner.interface import Options
+            options = {
+                'kernel_name': 'flush_gpu',
+                'lang': 'CUDA',
+                'arguments': self.flush_kernel_gpu_args,
+                'problem_size': cache_size,
+                'grid_div_x': None,
+                'grid_div_y': None,
+                'grid_div_z': None,
+                'block_size_names': None,
+            }
+            options = Options(options)
+            flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY'
+            flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang)
+            self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet)
+            self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet)
+            self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args)
+
+            # from kernel_tuner.kernelbuilder import PythonKernel
+            # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args)
+
+    def flush_cache(self):
+        """Flush the GPU L2 cache by reading and writing an L2-sized dummy buffer on the device."""
+        if not hasattr(self, 'flush_kernel'):
+            return
+        self.dev.synchronize()
+        assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance)
+        # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args)
+        self.dev.synchronize()
+
+    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True):
+        """Benchmark one kernel execution at a time. With flush_cache=True (the default) the L2 cache is flushed before every iteration to avoid caching effects between iterations."""
         observers = [
             obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
         ]
 
         self.dev.synchronize()
         for _ in range(self.iterations):
+            if flush_cache:
+                self.flush_cache()
             for obs in observers:
                 obs.before_start()
             self.dev.synchronize()
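
The flush works by streaming an L2-sized buffer through the cache before each timed iteration, evicting whatever the previous run left resident. (Note that the patch allocates the host buffers as `cache_size` float32 elements while the kernel indexes them as `char*`, so only the first `cache_size` bytes of each buffer are actually touched, which is what matters for eviction.) The sketch below exercises the same technique in isolation with CuPy; the hard-coded 40 MB cache size, the 256-thread block, and the `extern "C"` wrapper are illustrative assumptions, not taken from kernel_tuner.

```python
import numpy as np
import cupy as cp

# Same flush kernel as in the patch; extern "C" avoids C++ name mangling so
# that RawKernel can look up the symbol by name.
flush_gpu_string = r"""
extern "C" __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const char val = d_flush[idx];
    if (do_write * val) {
        d_flush2[idx] = val;
    }
}
"""

# Assumed L2 size of 40 MB; on real hardware query the device instead.
cache_size = 40 * 1024 * 1024
d_flush = cp.ones(cache_size, dtype=cp.int8)
d_flush2 = cp.ones(cache_size, dtype=cp.int8)

flush_kernel = cp.RawKernel(flush_gpu_string, "flush_gpu")
threads = 256
grid = (cache_size + threads - 1) // threads

# Reading cache_size bytes and writing cache_size bytes evicts the lines left
# in L2 by a previously benchmarked kernel, so the next run starts cold.
flush_kernel((grid,), (threads,), (d_flush, d_flush2, np.bool_(True)))
cp.cuda.get_current_stream().synchronize()
```
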
@@ -1008,3 +1056,4 @@ def wrap_templated_kernel(kernel_string, kernel_name):
     new_kernel_string += wrapper_function
 
     return new_kernel_string, name + "_wrapper"
+