-#Just testing the docker-py SDK
+from subprocess import check_output
+import re
 import docker
 
 class NVDockerClient:
-    client = None
-
-    gpu_devices = ['/dev/nvidiactl', '/dev/nvidia-uvm', '/dev/nvidia1', '/dev/nvidia0']
-    nvidia_driver = 'nvidia-docker'
-    volumes = {'nvidia_driver_387.12': {'bind': '/usr/local/nvidia', 'mode': 'ro'},
-               '/vault': {'bind': '/vault', 'mode': 'rw'}}
-    ports = {'8888/tcp': 8890,
-             '6006/tcp': 6969}
+
     def __init__(self):
-        self.client = docker.from_env(version='auto')
+        self.docker_client = docker.from_env(version="auto")
+
+    #TODO: test on multi-GPU setups
+    def create_container(self, image, **kwargs):
+        #defaults
+        config = {
+            "auto_remove": False,
+            "detach": True
+        }
+        environment = {}
+        #map GPU-specific kwargs onto the NVIDIA_* environment variables read
+        #by the nvidia runtime; pass everything else straight through to docker-py
+        for arg in kwargs:
+            if arg == "driver_capabilities":
+                environment["NVIDIA_DRIVER_CAPABILITIES"] = kwargs["driver_capabilities"]
+            elif arg == "visible_devices":
+                vis_devices = ""
+                if type(kwargs["visible_devices"]) is list:
+                    #accept ints or strings; join into a comma-separated list
+                    vis_devices = ",".join(str(dev) for dev in kwargs["visible_devices"])
+                elif type(kwargs["visible_devices"]) is str:
+                    vis_devices = kwargs["visible_devices"]
+                elif type(kwargs["visible_devices"]) is int:
+                    vis_devices = str(kwargs["visible_devices"])
+                environment["NVIDIA_VISIBLE_DEVICES"] = vis_devices
+            elif arg == "disable_require":
+                environment["NVIDIA_DISABLE_REQUIRE"] = kwargs["disable_require"]
+            elif arg == "require":
+                if "cuda" in kwargs["require"]:
+                    environment["NVIDIA_REQUIRE_CUDA"] = kwargs["require"]["cuda"]
+                if "driver" in kwargs["require"]:
+                    environment["NVIDIA_REQUIRE_DRIVER"] = kwargs["require"]["driver"]
+                if "arch" in kwargs["require"]:
+                    environment["NVIDIA_REQUIRE_ARCH"] = kwargs["require"]["arch"]
+            elif arg == "cuda_version":
+                print("WARNING: the CUDA_VERSION environment variable is legacy; consider moving to NVIDIA_REQUIRE_CUDA")
+                environment["CUDA_VERSION"] = kwargs["cuda_version"]
+            elif arg == "environment":
+                if type(kwargs["environment"]) is dict:
+                    for k, v in kwargs["environment"].items():
+                        environment[k] = v
+                elif type(kwargs["environment"]) is list:
+                    for e in kwargs["environment"]:
+                        kv = e.split("=", 1)
+                        assert len(kv) == 2, "environment entries must follow the format SOMEVAR=xxx"
+                        environment[kv[0]] = kv[1]
+            else:
+                config[arg] = kwargs[arg]
+        config["environment"] = environment
+        config["runtime"] = "nvidia"
 
-    def create_container(self, cmd, image=None, is_gpu=False, ports=None, user=""):
-        home_dir = "/vault/"
-        if user != "":
-            home_dir = home_dir + user
+        c = self.docker_client.containers.run(image, "", **config)
+
+        return c
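+    #Usage sketch (illustrative only: the image tag and device ids below are
+    #hypothetical, and the host is assumed to have the "nvidia" Docker runtime):
+    #  client = NVDockerClient()
+    #  c = client.create_container("nvidia/cuda:9.0-base",
+    #                              visible_devices=[0, 1],
+    #                              driver_capabilities="compute,utility")
+    #  print(c.id)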
 
-        if ports is not None:
-            self.ports['8888/tcp'] = ports[0]
-            self.ports['6006/tcp'] = ports[1]
 
-        if is_gpu:
-            c = self.client.containers.run(image, cmd, auto_remove=True, ports=self.ports, devices=self.gpu_devices, volume_driver=self.nvidia_driver, volumes=self.volumes, detach=True, working_dir=home_dir)
+    def run(self, image, cmd="", **kwargs):
+        #defaults
+        config = {}
+        environment = {}
+        #same NVIDIA_* kwarg mapping as create_container
+        for arg in kwargs:
+            if arg == "driver_capabilities":
+                environment["NVIDIA_DRIVER_CAPABILITIES"] = kwargs["driver_capabilities"]
+            elif arg == "visible_devices":
+                vis_devices = ""
+                if type(kwargs["visible_devices"]) is list:
+                    vis_devices = ",".join(str(dev) for dev in kwargs["visible_devices"])
+                elif type(kwargs["visible_devices"]) is str:
+                    vis_devices = kwargs["visible_devices"]
+                elif type(kwargs["visible_devices"]) is int:
+                    vis_devices = str(kwargs["visible_devices"])
+                environment["NVIDIA_VISIBLE_DEVICES"] = vis_devices
+            elif arg == "disable_require":
+                environment["NVIDIA_DISABLE_REQUIRE"] = kwargs["disable_require"]
+            elif arg == "require":
+                if "cuda" in kwargs["require"]:
+                    environment["NVIDIA_REQUIRE_CUDA"] = kwargs["require"]["cuda"]
+                if "driver" in kwargs["require"]:
+                    environment["NVIDIA_REQUIRE_DRIVER"] = kwargs["require"]["driver"]
+                if "arch" in kwargs["require"]:
+                    environment["NVIDIA_REQUIRE_ARCH"] = kwargs["require"]["arch"]
+            elif arg == "cuda_version":
+                print("WARNING: the CUDA_VERSION environment variable is legacy; consider moving to NVIDIA_REQUIRE_CUDA")
+                environment["CUDA_VERSION"] = kwargs["cuda_version"]
+            elif arg == "environment":
+                if type(kwargs["environment"]) is dict:
+                    for k, v in kwargs["environment"].items():
+                        environment[k] = v
+                elif type(kwargs["environment"]) is list:
+                    for e in kwargs["environment"]:
+                        kv = e.split("=", 1)
+                        assert len(kv) == 2, "environment entries must follow the format SOMEVAR=xxx"
+                        environment[kv[0]] = kv[1]
+            else:
+                config[arg] = kwargs[arg]
+        config["environment"] = environment
+        config["runtime"] = "nvidia"
+
+        c = self.docker_client.containers.run(image, cmd, **config)
+
+        if cmd == "":
+            return c.id
         else:
-            c = self.client.containers.run(image, cmd, auto_remove=True, detach=True, working_dir=home_dir)
-
-        return c.id
+            return c
 
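+    #Sketch of run()'s two return paths (assumes detach=True is passed, so
+    #containers.run returns a Container instead of blocking for output):
+    #  cid = client.run("nvidia/cuda:9.0-base", detach=True)              #no cmd -> id
+    #  c = client.run("nvidia/cuda:9.0-base", "nvidia-smi", detach=True)  #cmd -> Container
+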
 
     def build_image(self, path):
-        img = self.client.images.build(path);
+        #docker-py's images.build expects keyword arguments
+        img = self.docker_client.images.build(path=path)
         return img
 
     def get_container_logs(self, cid):
-        c = self.client.containers.get(cid)
+        c = self.docker_client.containers.get(cid)
         return c.logs()
 
     def get_all_container_ids(self):
-        return self.client.containers.list()
+        #return ids, as the name promises, rather than Container objects
+        return [c.id for c in self.docker_client.containers.list()]
 
     def stop_container(self, cid):
-        c = self.client.containers.get(cid)
+        c = self.docker_client.containers.get(cid)
         c.stop()
 
     def start_container(self, cid):
-        c = self.client.containers.get(cid)
+        c = self.docker_client.containers.get(cid)
         c.start()
 
     def start_all_containers(self):
-        for c in self.client.containers.list():
+        #list(all=True) is needed here: the default only lists running containers
+        for c in self.docker_client.containers.list(all=True):
             c.start()
 
     def stop_all_containers(self):
-        for c in self.client.containers.list():
+        for c in self.docker_client.containers.list():
             c.stop()
 
-    def run_cmd(self, cid, cmd):
-        c = self.client.containers.get(cid)
+    def exec_run(self, cid, cmd):
+        c = self.docker_client.containers.get(cid)
         return c.exec_run(cmd)
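+    #Sketch: exec into a running container (cid may be an id or name; on
+    #docker-py >= 3.0 exec_run returns an (exit_code, output) tuple,
+    #earlier versions return the output only):
+    #  exit_code, output = client.exec_run(cid, "nvidia-smi")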
-
+
+    @staticmethod
+    def list_gpus():
+        #parse "nvidia-smi -L" lines of the form "GPU 0: ..." into integer ids
+        output = check_output(["nvidia-smi", "-L"]).decode("utf-8")
+        regex = re.compile(r"GPU (?P<id>\d+):")
+        gpus = []
+        for line in output.strip().split("\n"):
+            m = regex.match(line)
+            assert m, "unable to parse " + line
+            gpus.append(int(m.group("id")))
+        return gpus
+
+    @staticmethod
+    def gpu_memory_usage():
+        #sum per-process memory (MiB) from the nvidia-smi processes table, keyed by GPU id
+        output = check_output(["nvidia-smi"]).decode("utf-8")
+        smi_output = output[output.find("GPU Memory"):]
+        regex = re.compile(r"[|]\s+?(?P<id>\d+)\D+?(?P<pid>\d+).+[ ](?P<usage>\d+)MiB")
+        usage = {gpu_id: 0 for gpu_id in NVDockerClient.list_gpus()}
+        for row in smi_output.split("\n"):
+            gpu = regex.search(row)
+            if not gpu:
+                continue
+            gpu_id = int(gpu.group("id"))
+            memory = int(gpu.group("usage"))
+            usage[gpu_id] += memory
+        return usage
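+
+
+if __name__ == "__main__":
+    #smoke-test sketch: assumes nvidia-smi is on PATH and at least one GPU is visible
+    print(NVDockerClient.list_gpus())         #e.g. [0, 1]
+    print(NVDockerClient.gpu_memory_usage())  #e.g. {0: 0, 1: 2487}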