Skip to content

Commit

Permalink
Merge pull request #197 from KernelTuner/openacc
Browse files Browse the repository at this point in the history
Merging OpenACC tuning support, and fixing Issue #193 in the process.
  • Loading branch information
isazi authored Oct 4, 2023
2 parents 9955ab3 + 4f3dab7 commit 5b2cb6e
Show file tree
Hide file tree
Showing 18 changed files with 1,209 additions and 269 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@ temp_*.*
.DS_Store
.AppleDouble
.LSOverride

.vscode
.idea
67 changes: 67 additions & 0 deletions examples/c/vector_add_openacc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
extract_directive_signature,
extract_directive_code,
extract_preprocessor,
generate_directive_function,
extract_directive_data,
allocate_signature_memory,
)
from collections import OrderedDict

# C source containing the OpenACC vector addition; the region enclosed by the
# "#pragma tuner start" / "#pragma tuner stop" markers is named "vector_add"
code = """
#include <stdlib.h>
#define VECTOR_SIZE 65536
int main(void) {
int size = VECTOR_SIZE;
float * a = (float *) malloc(VECTOR_SIZE * sizeof(float));
float * b = (float *) malloc(VECTOR_SIZE * sizeof(float));
float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));
#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
#pragma acc loop
for ( int i = 0; i < size; i++ ) {
c[i] = a[i] + b[i];
}
#pragma tuner stop
free(a);
free(b);
free(c);
}
"""

# Name given to the tunable region in the "#pragma tuner start" line above
kernel_name = "vector_add"

# Extract the pieces of the tunable directive and build kernel_string from them
preprocessor = extract_preprocessor(code)
signature = extract_directive_signature(code)
body = extract_directive_code(code)
kernel_string = generate_directive_function(
    preprocessor,
    signature[kernel_name],
    body[kernel_name],
)

# Allocate host memory for the arguments listed in the directive
data = extract_directive_data(code)
args = allocate_signature_memory(data[kernel_name], preprocessor)

# Search space: powers of two, 1..16384 gangs and 1..1024 for the vector length
tune_params = OrderedDict(
    [
        ("ngangs", [2**exponent for exponent in range(15)]),
        ("nthreads", [2**exponent for exponent in range(11)]),
    ]
)

# Reference output: the third argument (c) is expected to hold a + b.
# NOTE(review): the None entries are presumably not verified by tune_kernel;
# confirm against the documentation of its `answer` parameter.
expected_c = args[0] + args[1]
answer = [None, None, expected_c, None]

tune_kernel(
    kernel_name,
    kernel_string,
    0,
    args,
    tune_params,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-acc=gpu"],
)
18 changes: 13 additions & 5 deletions examples/fortran/test_fortran_vector_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
import numpy as np
from kernel_tuner import run_kernel

def test():

def test():
filename = Path(__file__).parent / "vector_add.F90"
with open(filename, 'r') as f:
with open(filename, "r") as f:
kernel_string = f.read()

size = 10000000
Expand All @@ -25,9 +25,17 @@ def test():
tune_params["N"] = size
tune_params["NTHREADS"] = 4

answer = run_kernel("vector_add", kernel_string, size, args, tune_params, lang="fortran", compiler="gfortran")

assert np.allclose(answer[0], a+b, atol=1e-8)
answer = run_kernel(
"vector_add",
kernel_string,
size,
args,
tune_params,
lang="fortran",
compiler="gfortran",
)

assert np.allclose(answer[0], a + b, atol=1e-8)


if __name__ == "__main__":
Expand Down
12 changes: 8 additions & 4 deletions examples/fortran/vector_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@


def tune():

size = int(80e6)

a = np.random.randn(size).astype(np.float32)
Expand All @@ -23,9 +22,14 @@ def tune():

print("compile with gfortran")
result, _ = tune_kernel(
"time_vector_add", "vector_add.F90", size,
args, tune_params, lang="C", compiler="gfortran"
)
"time_vector_add",
"vector_add.F90",
size,
args,
tune_params,
lang="C",
compiler="gfortran",
)

return result

Expand Down
6 changes: 3 additions & 3 deletions examples/fortran/vector_add_acc.F90
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ subroutine vector_add(C, A, B, n)
use iso_c_binding
real (c_float), intent(out), dimension(N) :: C
real (c_float), intent(in), dimension(N) :: A, B
integer (c_int), intent(in) :: n
integer (c_int), value, intent(in) :: n

!$acc data copyin(A, B) copyout(C)

!$acc parallel loop device_type(nvidia) vector_length(block_size_x)
do i = 1, N
do i = 1, n
C(i) = A(i) + B(i)
end do
!$acc end parallel loop
Expand All @@ -35,7 +35,7 @@ function time_vector_add(C, A, B, n) result(time)
use iso_c_binding
real (c_float), intent(out), dimension(N) :: C
real (c_float), intent(in), dimension(N) :: A, B
integer (c_int), intent(in) :: n
integer (c_int), value, intent(in) :: n
real (c_float) :: time
real (c_double) start_time, end_time

Expand Down
20 changes: 13 additions & 7 deletions examples/fortran/vector_add_acc.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
#!/usr/bin/env python
"""This is a minimal example for calling Fortran functions"""

import logging
import json
import numpy as np
from kernel_tuner import tune_kernel

def tune():

size = int(72*1024*1024)
def tune():
size = int(72 * 1024 * 1024)

a = np.random.randn(size).astype(np.float32)
b = np.random.randn(size).astype(np.float32)
Expand All @@ -21,11 +19,19 @@ def tune():
tune_params["N"] = [size]
tune_params["block_size_x"] = [32, 64, 128, 256, 512]

result, env = tune_kernel("time_vector_add", "vector_add_acc.F90", size, args,
tune_params, lang="C", compiler="pgfortran",
compiler_options=["-acc=verystrict", "-ta=tesla,lineinfo"])
result, env = tune_kernel(
"time_vector_add",
"vector_add_acc.F90",
size,
args,
tune_params,
lang="C",
compiler="nvfortran",
compiler_options=["-fast", "-acc=gpu"],
)

return result


if __name__ == "__main__":
tune()
62 changes: 62 additions & 0 deletions examples/fortran/vector_add_openacc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python
"""This is a simple example for tuning Fortran OpenACC code with the kernel tuner"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
extract_directive_signature,
extract_directive_code,
extract_preprocessor,
generate_directive_function,
extract_directive_data,
allocate_signature_memory,
)
from collections import OrderedDict

# Fortran source containing the OpenACC vector addition; the region enclosed by
# the "!$tuner start" / "!$tuner stop" markers is named "vector_add"
code = """
#define VECTOR_SIZE 65536
subroutine vector_add(A, B, C, n)
use iso_c_binding
real (c_float), intent(out), dimension(VECTOR_SIZE) :: C
real (c_float), intent(in), dimension(VECTOR_SIZE) :: A, B
integer (c_int), intent(in) :: n
!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)
!$acc parallel loop num_gangs(ngangs) vector_length(vlength)
do i = 1, n
C(i) = A(i) + B(i)
end do
!$acc end parallel loop
!$tuner stop
end subroutine vector_add
"""

# Name given to the tunable region in the "!$tuner start" line above
kernel_name = "vector_add"

# Extract the pieces of the tunable directive and build kernel_string from them
preprocessor = extract_preprocessor(code)
signature = extract_directive_signature(code)
body = extract_directive_code(code)
kernel_string = generate_directive_function(
    preprocessor,
    signature[kernel_name],
    body[kernel_name],
)

# Allocate host memory for the arguments listed in the directive
data = extract_directive_data(code)
args = allocate_signature_memory(data[kernel_name], preprocessor)

# Search space: powers of two, 1..16384 gangs and 1..1024 for the vector length
tune_params = OrderedDict(
    [
        ("ngangs", [2**exponent for exponent in range(15)]),
        ("vlength", [2**exponent for exponent in range(11)]),
    ]
)

# Reference output: the third argument (C) is expected to hold A + B.
# NOTE(review): the None entries are presumably not verified by tune_kernel;
# confirm against the documentation of its `answer` parameter.
expected_c = args[0] + args[1]
answer = [None, None, expected_c, None]

tune_kernel(
    kernel_name,
    kernel_string,
    0,
    args,
    tune_params,
    answer=answer,
    compiler="nvfortran",
    compiler_options=["-fast", "-acc=gpu"],
)
46 changes: 29 additions & 17 deletions kernel_tuner/backends/c.py → kernel_tuner/backends/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import numpy.ctypeslib

from kernel_tuner.backends.backend import CompilerBackend
from kernel_tuner.observers.c import CRuntimeObserver
from kernel_tuner.observers.compiler import CompilerRuntimeObserver
from kernel_tuner.util import (
get_temp_filename,
delete_temp_file,
Expand All @@ -40,7 +40,7 @@
Argument = namedtuple("Argument", ["numpy", "ctypes"])


class CFunctions(CompilerBackend):
class CompilerFunctions(CompilerBackend):
"""Class that groups the code for running and compiling C functions"""

def __init__(self, iterations=7, compiler_options=None, compiler=None, observers=None):
Expand All @@ -50,7 +50,7 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers
:type iterations: int
"""
self.observers = observers or []
self.observers.append(CRuntimeObserver(self))
self.observers.append(CompilerRuntimeObserver(self))

self.iterations = iterations
self.max_threads = 1024
Expand All @@ -59,13 +59,24 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None, observers
self.compiler = compiler or "g++"
self.lib = None
self.using_openmp = False
self.using_openacc = False
self.observers = [CompilerRuntimeObserver(self)]
self.last_result = None

try:
cc_version = str(subprocess.check_output([self.compiler, "--version"]))
cc_version = cc_version.splitlines()[0].split(" ")[-1]
except OSError as e:
raise e
if self.compiler == "g++":
try:
cc_version = str(subprocess.check_output([self.compiler, "--version"]))
cc_version = cc_version.split("\\n")[0].split(" ")[2]
except OSError as e:
raise e
elif self.compiler in ["nvc", "nvc++", "nvfortran"]:
try:
cc_version = str(subprocess.check_output([self.compiler, "--version"]))
cc_version = cc_version.split(" ")[1]
except OSError as e:
raise e
else:
cc_version = None

# check if nvcc is available
self.nvcc_available = False
Expand Down Expand Up @@ -145,13 +156,14 @@ def compile(self, kernel_instance):
if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
logging.debug("set using_openmp to true")
self.using_openmp = True
if self.compiler == "pgfortran":
if self.compiler in ["nvc", "nvc++", "nvfortran"]:
compiler_options.append("-mp")
else:
if "#pragma acc" in kernel_string or "!$acc" in kernel_string:
compiler_options.append("-fopenacc")
else:
compiler_options.append("-fopenmp")
compiler_options.append("-fopenmp")

# detect openacc
if "#pragma acc" in kernel_string or "!$acc" in kernel_string:
self.using_openacc = True

# if filename is known, use that one
suffix = kernel_instance.kernel_source.get_user_suffix()
Expand All @@ -177,7 +189,7 @@ def compile(self, kernel_instance):
# select right suffix based on compiler
suffix = ".cc"

if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]:
if self.compiler in ["gfortran", "nvfortran", "ftn", "ifort"]:
suffix = ".F90"

if self.compiler == "nvcc":
Expand Down Expand Up @@ -210,11 +222,11 @@ def compile(self, kernel_instance):
kernel_name = "__" + match.group(1) + "_MOD_" + kernel_name
elif self.compiler in ["ftn", "ifort"]:
kernel_name = match.group(1) + "_mp_" + kernel_name + "_"
elif self.compiler == "pgfortran":
elif self.compiler == "nvfortran":
kernel_name = match.group(1) + "_" + kernel_name + "_"
else:
# for functions outside of modules
if self.compiler in ["gfortran", "ftn", "ifort", "pgfortran"]:
if self.compiler in ["gfortran", "ftn", "ifort", "nvfortran"]:
kernel_name = kernel_name + "_"

try:
Expand Down Expand Up @@ -340,7 +352,7 @@ def memcpy_htod(self, dest, src):

def cleanup_lib(self):
"""unload the previously loaded shared library"""
if not self.using_openmp:
if not self.using_openmp and not self.using_openacc:
# this if statement is necessary because shared libraries that use
# OpenMP will core dump when unloaded, this is a well-known issue with OpenMP
logging.debug("unloading shared library")
Expand Down
Loading

0 comments on commit 5b2cb6e

Please sign in to comment.