From 97761d98afadcaf7c3611776e461329a0da39dc8 Mon Sep 17 00:00:00 2001 From: Andre Merzky Date: Fri, 4 Aug 2023 17:50:34 +0200 Subject: [PATCH] response to comments --- src/radical/pilot/agent/agent_0.py | 4 ++- src/radical/pilot/agent/executing/popen.py | 8 +++++- src/radical/pilot/configs/agent_debug_sa.json | 9 ------- src/radical/pilot/configs/agent_default.json | 25 ++++++++----------- .../pilot/tmgr/staging_input/default.py | 2 +- 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/radical/pilot/agent/agent_0.py b/src/radical/pilot/agent/agent_0.py index 25cf24503b..6f4aabe269 100644 --- a/src/radical/pilot/agent/agent_0.py +++ b/src/radical/pilot/agent/agent_0.py @@ -472,7 +472,9 @@ def _start_sub_agents(self): if target == 'local': # start agent locally - cmdline = '/bin/sh -l %s/bootstrap_2.sh %s' % (self._pwd, sa) + bs_name = '%s/bootstrap_2.sh' % (self._pwd) + args = ' '.join([self._sid, self.cfg.reg_addr, sa]) + cmdline = '/bin/sh -l %s/%s %s' % (self._pwd, bs_name, args) else: # target == 'node': diff --git a/src/radical/pilot/agent/executing/popen.py b/src/radical/pilot/agent/executing/popen.py index eac191ef1e..351080df47 100644 --- a/src/radical/pilot/agent/executing/popen.py +++ b/src/radical/pilot/agent/executing/popen.py @@ -548,6 +548,12 @@ def _get_rp_env(self, task): if sbox.startswith(self._pwd): sbox = '$RP_PILOT_SANDBOX%s' % sbox[len(self._pwd):] + gpr = td['gpus_per_rank'] + if int(gpr) == gpr: + gpr = '%d' % gpr + else: + gpr = '%f' % gpr + ret = '\n' ret += 'export RP_TASK_ID="%s"\n' % tid ret += 'export RP_TASK_NAME="%s"\n' % name @@ -560,7 +566,7 @@ def _get_rp_env(self, task): ret += 'export RP_TASK_SANDBOX="%s"\n' % sbox ret += 'export RP_REGISTRY_ADDRESS="%s"\n' % self._session.reg_addr ret += 'export RP_CORES_PER_RANK=%d\n' % td['cores_per_rank'] - ret += 'export RP_GPUS_PER_RANK=%d\n' % td['gpus_per_rank'] + ret += 'export RP_GPUS_PER_RANK=%s\n' % gpr # FIXME AM # ret += 'export RP_LFS="%s"\n' % self.lfs diff --git a/src/radical/pilot/configs/agent_debug_sa.json b/src/radical/pilot/configs/agent_debug_sa.json index e74a62fd50..dac7a4a11d 100644 --- a/src/radical/pilot/configs/agent_debug_sa.json +++ b/src/radical/pilot/configs/agent_debug_sa.json @@ -31,15 +31,6 @@ "stall_hwm" : 1, "bulk_size" : 1024}, - "funcs_wrk_queue" : { "kind" : "queue", - "log_level" : "error", - "stall_hwm" : 1, - "bulk_size" : 0}, - "funcs_res_queue" : { "kind" : "queue", - "log_level" : "error", - "stall_hwm" : 1, - "bulk_size" : 0}, - "agent_unschedule_pubsub" : { "kind" : "pubsub", "log_level" : "error", "stall_hwm" : 1, diff --git a/src/radical/pilot/configs/agent_default.json b/src/radical/pilot/configs/agent_default.json index 2c4e67dde5..5384ae4e94 100644 --- a/src/radical/pilot/configs/agent_default.json +++ b/src/radical/pilot/configs/agent_default.json @@ -23,24 +23,21 @@ # stall_hwm and batch_size is 1 (no stalling, no bulking). # "bridges" : { - "agent_staging_input_queue" : {"kind": "queue", "log_lvl":"debug"}, - "agent_scheduling_queue" : {"kind": "queue", "log_lvl":"debug"}, - "agent_executing_queue" : {"kind": "queue", "log_lvl":"debug"}, - "agent_staging_output_queue" : {"kind": "queue", "log_lvl":"debug"}, - "agent_collecting_queue" : {"kind": "queue", "log_lvl":"debug"}, + "agent_staging_input_queue" : {"kind": "queue", "log_lvl":"error"}, + "agent_scheduling_queue" : {"kind": "queue", "log_lvl":"error"}, + "agent_executing_queue" : {"kind": "queue", "log_lvl":"error"}, + "agent_staging_output_queue" : {"kind": "queue", "log_lvl":"error"}, + "agent_collecting_queue" : {"kind": "queue", "log_lvl":"error"}, - "funcs_req_queue" : {"kind": "queue", "log_lvl":"debug"}, - "funcs_res_queue" : {"kind": "queue", "log_lvl":"debug"}, + "raptor_scheduling_queue" : {"kind": "queue", "log_lvl":"error"}, - "raptor_scheduling_queue" : {"kind": "queue", "log_lvl":"debug"}, + "agent_unschedule_pubsub" : {"kind": "pubsub", "log_lvl":"error"}, + "agent_schedule_pubsub" : {"kind": "pubsub", "log_lvl":"error"}, - "agent_unschedule_pubsub" : {"kind": "pubsub", "log_lvl":"debug"}, - "agent_schedule_pubsub" : {"kind": "pubsub", "log_lvl":"debug"}, + "control_pubsub" : {"kind": "pubsub", "log_lvl":"error"}, + "state_pubsub" : {"kind": "pubsub", "log_lvl":"error"} - "control_pubsub" : {"kind": "pubsub", "log_lvl":"debug"}, - "state_pubsub" : {"kind": "pubsub", "log_lvl":"debug"} - - # "log_pubsub" : {"kind": "pubsub", "log_lvl":"debug"} + # "log_pubsub" : {"kind": "pubsub", "log_lvl":"error"} }, "components" : { diff --git a/src/radical/pilot/tmgr/staging_input/default.py b/src/radical/pilot/tmgr/staging_input/default.py index 9d03fd6bed..1fbe782dab 100644 --- a/src/radical/pilot/tmgr/staging_input/default.py +++ b/src/radical/pilot/tmgr/staging_input/default.py @@ -210,7 +210,7 @@ def work(self, tasks): task_sboxes = sboxes[pid] - if False or len(task_sboxes) >= self._mkdir_threshold: + if len(task_sboxes) >= self._mkdir_threshold: self._log.debug('tar %d sboxes', len(task_sboxes)) session_sbox = self._session._get_session_sandbox(pilot)