@@ -169,6 +169,60 @@ def get_cmd(self, environment, active_resources):
169169 return mpirun_cmd + export_cmd + python_exec + [self .user_script
170170 ] + self .user_arguments
171171
class JSRunner(MultiNodeRunner):
    """Multi-node runner that launches the user script through ``jsrun``.

    The resource-set layout (CPUs, GPUs and tasks per resource set) and the
    exported ``CUDA_VISIBLE_DEVICES`` value are hard coded for Summit.
    """

    def __init__(self, args, world_info_base64, resource_pool):
        """Store the resource pool and register the Summit GPU export.

        Args:
            args: Parsed launcher arguments; this class reads ``include``,
                ``exclude``, ``num_nodes``, ``num_gpus``, ``launcher_args``,
                ``no_python`` and ``module`` from it.
            world_info_base64: Forwarded unchanged to the base class.
            resource_pool: Mapping whose values are summed to obtain the
                total process count (presumably host -> slot count; confirm
                against the caller).
        """
        super().__init__(args, world_info_base64)
        self.resource_pool = resource_pool
        # Hard coded for Summit
        self.add_export('CUDA_VISIBLE_DEVICES', '0,1,2,3,4,5')

    def backend_exists(self):
        """Return a truthy value when the jsrun backend looks usable.

        Both ``jsrun`` (the executable ``get_cmd`` actually launches) and
        ``ompi_info`` must be found on ``PATH``. The result is the path of
        ``ompi_info`` when both are present and ``None`` otherwise, so callers
        that only test truthiness keep working.
        """
        # TODO: if IB is available we should suggest mvapich
        # The ompi check still works for jsrun since spectrum-mpi is based on
        # ompi; jsrun itself is probed too so a plain Open MPI install without
        # jsrun is not mistaken for a working backend.
        return shutil.which('jsrun') and shutil.which('ompi_info')

    @property
    def name(self):
        """Backend identifier used in messages."""
        return "jsrun"

    def validate_args(self):
        """Reject launcher options this backend cannot honour.

        Raises:
            ValueError: If an include/exclude filter is set, or if
                ``num_nodes`` / ``num_gpus`` differ from ``-1``.
        """
        super().validate_args()
        # TODO: Allow for include/exclude at node-level but not gpu-level
        if self.args.include != "" or self.args.exclude != "":
            raise ValueError(
                f"{self.name} backend does not support worker include/exclusion")
        if self.args.num_nodes != -1 or self.args.num_gpus != -1:
            raise ValueError(
                f"{self.name} backend does not support limiting num nodes/gpus")

    def get_cmd(self, environment, active_resources):
        """Build the full ``jsrun`` command line as a list of arguments.

        ``environment`` and ``active_resources`` are accepted for interface
        compatibility but are not used by this implementation.

        Returns:
            list: ``jsrun`` invocation, followed by one ``-E KEY=VALUE`` pair
            per entry in ``self.exports``, the optional Python interpreter
            prefix, the user script and the user arguments.
        """
        total_process_count = sum(self.resource_pool.values())

        jsrun_cmd = [
            'jsrun',
            '-n', f'{total_process_count}',  # number of resource sets
            '-c', '7',  # CPUs per resource set
            '-g', '1',  # GPUs per resource set
            '-a', '1',  # tasks per resource set
        ] + split(self.args.launcher_args)

        export_cmd = []
        for key, value in self.exports.items():
            export_cmd += ['-E', f"{key}={value}"]

        python_exec = []
        if not self.args.no_python:
            # "-u" keeps the interpreter's stdout/stderr unbuffered.
            python_exec = [sys.executable, "-u"]
            if self.args.module:
                python_exec.append("-m")

        return (jsrun_cmd + export_cmd + python_exec + [self.user_script] +
                self.user_arguments)
172226
173227class MPICHRunner (MultiNodeRunner ):
174228 def __init__ (self , args , world_info_base64 , resource_pool ):
0 commit comments