14
14
from dkeras .utils .sys_functions import get_port , get_addr
15
15
16
16
17
+ def _cmd (cmd ):
18
+ return subprocess .check_output (cmd .split (' ' )).decode ()
19
+
20
+
17
21
def _which_qsub ():
18
- return subprocess .check_output (['which' , 'qsub' ]).decode ().replace ('\n ' , ' ' )
22
+ return subprocess .check_output (['which' , 'qsub' ]).decode ().replace ('\n ' , '' )
19
23
20
24
21
25
def _which_qstat ():
22
- return subprocess .check_output (['which' , 'qstat' ]).decode ().replace ('\n ' , ' ' )
26
+ return subprocess .check_output (['which' , 'qstat' ]).decode ().replace ('\n ' , '' )
23
27
24
28
25
29
def _which_qdel ():
26
- return subprocess .check_output (['which' , 'qdel' ]).decode ().replace ('\n ' , ' ' )
30
+ return subprocess .check_output (['which' , 'qdel' ]).decode ().replace ('\n ' , '' )
27
31
28
32
29
33
create_worker_script = """
@@ -42,19 +46,19 @@ def wait_for_workers(n_workers, timeout=300):
42
46
print ("Waiting for {} workers" .format (n_workers ))
43
47
while True :
44
48
n_nodes = ray .get (_get_n_nodes .remote ())
45
- if len ( n_nodes ) >= n_workers :
49
+ if n_nodes >= n_workers :
46
50
return True
47
51
if (start_time - time .time () >= timeout ):
48
52
return False
49
53
50
54
51
55
def rm_existing_workers (qstat_path = 'qstat' , qdel_path = 'qdel' ):
52
- cmd = "{} | grep worker_script | cut - d ' ' - f1 | xargs {}" .format (
56
+ cmd = "{} | grep worker_script | cut - d ' ' -f1 | xargs {}" .format (
53
57
qstat_path , qdel_path )
54
58
os .system (cmd )
55
59
56
60
57
- def init_pbs_ray (n_workers = 3 , rm_existing = True , iface_name = 'eno1' , worker_time = 3600 ):
61
+ def init_pbs_ray (n_workers = 3 , rm_existing = True , iface_name = 'eno1' , worker_time = 3600 , verbose = True ):
58
62
"""
59
63
60
64
:param n_workers:
@@ -65,25 +69,28 @@ def init_pbs_ray(n_workers=3, rm_existing=True, iface_name='eno1', worker_time=3
65
69
"""
66
70
if ray .is_initialized ():
67
71
if rm_existing :
68
- ray . shutdown ( )
72
+ _cmd ( 'ray stop' )
69
73
70
74
qsub_path = _which_qsub ()
71
75
qstat_path = _which_qstat ()
72
76
qdel_path = _which_qdel ()
73
77
74
- rm_existing_workers (qstat_path = qstat_path , qdel_path = qdel_path )
78
+ # rm_existing_workers(qstat_path=qstat_path, qdel_path=qdel_path)
79
+ rm_existing_workers ()
75
80
76
81
addresses = get_addr ('eno1' )
77
82
addr = addresses [0 ]
78
83
79
- print ("Address: " , addr )
84
+ if verbose :
85
+ print ("Address: " , addr )
80
86
if addr == 'No IP addr' :
81
87
raise Exception ("Address not found for {}" .format (iface_name ))
82
88
83
89
port = get_port ()[1 ]
84
- print ("Port " , port )
85
- print ("ray start --head --redis-port={}" .format (port ))
86
- os .system ('ray start --head --redis-port={}' .format (port ))
90
+ if verbose :
91
+ print ("Port " , port )
92
+ print ("ray start --head --redis-port={}" .format (port ))
93
+ _cmd ('ray start --head --redis-port={}' .format (port ))
87
94
88
95
temp_dir = 'temp_{}' .format ('_' .join (str (time .time ()).split ('.' )))
89
96
if not os .path .exists (temp_dir ):
@@ -93,28 +100,29 @@ def init_pbs_ray(n_workers=3, rm_existing=True, iface_name='eno1', worker_time=3
93
100
worker_script = create_worker_script .format (addr , port , worker_time )
94
101
with open (worker_file , 'w' ) as f :
95
102
f .write (worker_script )
96
- print ("Worker file " , worker_file )
103
+
104
+ if verbose :
105
+ print ("Worker file " , worker_file )
97
106
98
107
qsub_pids = []
99
108
for i in range (n_workers ):
100
- print ("{} {}" .format (qsub_path , worker_file ))
101
- print (list (qsub_path ))
102
- print (list (worker_file ))
109
+ if verbose :
110
+ print ("{} {}" .format (qsub_path , worker_file ))
111
+ print (list (qsub_path ))
112
+ print (list (worker_file ))
103
113
104
- qsub_pid = subprocess .check_output ([qsub_path , '-l ' , 'nodes=1:ppn=2 ' , worker_file ])
114
+ qsub_pid = subprocess .check_output ([qsub_path , '-lselect=1 ' , '-lplace=excl ' , worker_file ])
105
115
qsub_pid = qsub_pid .decode ()[:- 1 ].split ('.' )[0 ]
106
116
qsub_pids .append (qsub_pid )
107
117
108
- # os.system('{} -l nodes=1:ppn=2 {}'.format(qsub_path, worker_file))
109
- # print("{} {}".format(qsub_path, worker_file))
110
- print ("{}:{}" .format (addr , port ))
111
118
ray .init (redis_address = '{}:{}' .format (addr , port ))
112
119
print ("Ray initialized" )
113
120
return wait_for_workers (n_workers + 1 ), qsub_pids
114
121
115
122
116
123
def main ():
117
124
init_pbs_ray ()
125
+ print (ray .nodes ())
118
126
119
127
120
128
if __name__ == "__main__" :
0 commit comments