-
Notifications
You must be signed in to change notification settings - Fork 25
Open
Description
Hi, after installing rid-kit in a Slurm computing environment, I encountered the error output shown further below after running this command:
DFLOW_DEBUG=1 rid submit \
-i ./tests/data/000 \
-c ./rid/template/rid_gmx_dih.json \
-m ./rid/template/machine_slurm_gpu_cpu.json \
-d ala-dipeptide-1
For this run, I modified the parameters in machine_slurm_gpu_cpu.json as follows:
{
"resources": {
"local_machine": {
"executor":{
"image": "dptechnology/dpdispatcher:latest",
"merge_sliced_step": "True",
"machine_dict":{
"batch_type": "Slurm",
"context_type": "Local",
"local_root" : "./",
"remote_root": ""
},
"resources_dict":{
"number_node": 1,
"cpu_per_node": 8,
"gpu_per_node": 1,
"queue_name": "gpu",
"group_size": 1,
"custom_flags": [
"#SBATCH --time=72:00:00",
"#SBATCH --exclude=gpu05,gpu07"
],
"source_list": [
"export CONDA_PREFIX=/home/jygu/.conda/envs/rid-kit",
"conda activate /home/jygu/.conda/envs/rid-kit",
"export LIBRARY_PATH=/home/jygu/.conda/envs/rid-kit/lib:$LIBRARY_PATH",
"export LD_LIBRARY_PATH=/home/jygu/.conda/envs/rid-kit/lib:$LD_LIBRARY_PATH",
"export LD_LIBRARY_PATH=/opt/ohpc/pub/compiler/gcc/12.2.0/lib64:$LD_LIBRARY_PATH",
"export PYTHONPATH=/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages:$PYTHONPATH"
]
}
}
},
"local_machine_cpu": {
"executor":{
"image": "dptechnology/dpdispatcher:latest",
"merge_sliced_step": "True",
"machine_dict":{
"batch_type": "Slurm",
"context_type": "Local",
"local_root" : "./",
"remote_root": ""
},
"resources_dict":{
"number_node": 1,
"cpu_per_node": 8,
"gpu_per_node": 0,
"queue_name": "cpu",
"group_size": 1,
"custom_flags": [
"#SBATCH --time=120:00:00"
],
"source_list": [
"export CONDA_PREFIX=/home/jygu/.conda/envs/rid-kit",
"conda activate /home/jygu/.conda/envs/rid-kit",
"export LIBRARY_PATH=/home/jygu/.conda/envs/rid-kit/lib:$LIBRARY_PATH",
"export LD_LIBRARY_PATH=/home/jygu/.conda/envs/rid-kit/lib:$LD_LIBRARY_PATH",
"export LD_LIBRARY_PATH=/opt/ohpc/pub/compiler/gcc/12.2.0/lib64:$LD_LIBRARY_PATH",
"export PYTHONPATH=/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages:$PYTHONPATH"
]
}
}
}
},
"tasks": {
"prep_exploration_config": "local_machine_cpu",
"run_exploration_config": "local_machine",
"prep_label_config": "local_machine_cpu",
"run_label_config": "local_machine",
"prep_select_config": "local_machine",
"run_select_config": "local_machine",
"prep_data_config": "local_machine",
"run_train_config": "local_machine",
"model_devi_config": "local_machine",
"workflow_steps_config": "local_machine_cpu"
}
}
Please help me resolve this at your convenience. The error output is:
nohup: ignoring input
2024-11-27 11:30:37 | INFO | rid.entrypoint.main | Preparing RiD ...
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1672, in run
self.exec(scope, parameters, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1854, in exec
self.exec_pod(scope, parameters, item)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 2100, in exec_pod
raise RuntimeError("Run %s failed" % args)
RuntimeError: Run ['python3', '/home/jygu/wxy/rid_kit/rid-kit-master/ala-dipeptide-1/iter-001-prep-exploration-0/script'] failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1919, in exec_steps
steps.run(scope.workflow_id, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/steps.py", line 291, in run
step.run(self, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1676, in run
raise RuntimeError("Step %s failed" % self)
RuntimeError: Step prep-exploration failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1672, in run
self.exec(scope, parameters, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1852, in exec
self.exec_steps(scope, parameters, item, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1924, in exec_steps
raise RuntimeError("Step %s failed" % self)
RuntimeError: Step Exploration failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1919, in exec_steps
steps.run(scope.workflow_id, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/steps.py", line 291, in run
step.run(self, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1676, in run
raise RuntimeError("Step %s failed" % self)
RuntimeError: Step Exploration failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1672, in run
self.exec(scope, parameters, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1852, in exec
self.exec_steps(scope, parameters, item, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1924, in exec_steps
raise RuntimeError("Step %s failed" % self)
RuntimeError: Step reinforced-dynamics-block failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1919, in exec_steps
steps.run(scope.workflow_id, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/steps.py", line 291, in run
step.run(self, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1676, in run
raise RuntimeError("Step %s failed" % self)
RuntimeError: Step reinforced-dynamics-block failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1672, in run
self.exec(scope, parameters, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1852, in exec
self.exec_steps(scope, parameters, item, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1924, in exec_steps
raise RuntimeError("Step %s failed" % self)
RuntimeError: Step rid-procedure failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/workflow.py", line 337, in submit
entrypoint.run(self.id, self.context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/steps.py", line 291, in run
step.run(self, context)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dflow/step.py", line 1676, in run
raise RuntimeError("Step %s failed" % self)
RuntimeError: Step rid-procedure failed
Workflow is running locally (ID: ala-dipeptide-1)
2024-11-27 11:31:29,697 - INFO : info:check_all_finished: False
2024-11-27 11:31:29,740 - INFO : job: a9d7d4853786fd5eacd7be7238065c476b953af3 submit; job_id is 803892
2024-11-27 11:46:33,688 - INFO : job: a9d7d4853786fd5eacd7be7238065c476b953af3 803892 finished
2024-11-27 11:46:38,665 - INFO : info:check_all_finished: False
2024-11-27 11:46:38,692 - INFO : job: f4007d8c7fd7061241ccf4088f7083376cf1d0f3 submit; job_id is 803893
2024-11-27 12:02:11,216 - INFO : job: f4007d8c7fd7061241ccf4088f7083376cf1d0f3 803893 finished
2024-11-27 12:02:13,472 - INFO : info:check_all_finished: False
2024-11-27 12:02:13,516 - INFO : job: 26dad25f1e072c05c7b3c041933108bd8515ec3c submit; job_id is 803896
2024-11-27 12:02:13,534 - INFO : job: bc817200d66825111065f773a821b7170a88bf87 submit; job_id is 803897
2024-11-27 12:02:13,551 - INFO : job: a9b29add52b7c29d91b5b26b2c3aae99ff2d27d7 submit; job_id is 803898
2024-11-27 12:02:13,568 - INFO : job: d3de077e16150b4b84e7186904477af4be382f7c submit; job_id is 803899
2024-11-27 12:16:16,845 - INFO : job: 26dad25f1e072c05c7b3c041933108bd8515ec3c 803896 terminated;fail_cout is 1; resubmitting job
2024-11-27 12:16:16,864 - INFO : job:26dad25f1e072c05c7b3c041933108bd8515ec3c re-submit after terminated; new job_id is 803913
2024-11-27 12:16:17,079 - INFO : job:26dad25f1e072c05c7b3c041933108bd8515ec3c job_id:803913 after re-submitting; the state now is <JobStatus.waiting: 2>
2024-11-27 12:16:17,079 - INFO : job: bc817200d66825111065f773a821b7170a88bf87 803897 terminated;fail_cout is 1; resubmitting job
2024-11-27 12:16:17,097 - INFO : job:bc817200d66825111065f773a821b7170a88bf87 re-submit after terminated; new job_id is 803914
2024-11-27 12:16:17,310 - INFO : job:bc817200d66825111065f773a821b7170a88bf87 job_id:803914 after re-submitting; the state now is <JobStatus.waiting: 2>
2024-11-27 12:16:17,310 - INFO : job: a9b29add52b7c29d91b5b26b2c3aae99ff2d27d7 803898 terminated;fail_cout is 1; resubmitting job
2024-11-27 12:16:17,338 - INFO : job:a9b29add52b7c29d91b5b26b2c3aae99ff2d27d7 re-submit after terminated; new job_id is 803915
2024-11-27 12:16:17,550 - INFO : job:a9b29add52b7c29d91b5b26b2c3aae99ff2d27d7 job_id:803915 after re-submitting; the state now is <JobStatus.running: 3>
2024-11-27 12:16:17,550 - INFO : job: d3de077e16150b4b84e7186904477af4be382f7c 803899 terminated;fail_cout is 1; resubmitting job
2024-11-27 12:16:17,572 - INFO : job:d3de077e16150b4b84e7186904477af4be382f7c re-submit after terminated; new job_id is 803916
2024-11-27 12:16:17,783 - INFO : job:d3de077e16150b4b84e7186904477af4be382f7c job_id:803916 after re-submitting; the state now is <JobStatus.waiting: 2>
2024-11-27 12:47:54,102 - INFO : job: 26dad25f1e072c05c7b3c041933108bd8515ec3c 803913 terminated;fail_cout is 2; resubmitting job
2024-11-27 12:47:54,144 - INFO : job:26dad25f1e072c05c7b3c041933108bd8515ec3c re-submit after terminated; new job_id is 803970
2024-11-27 12:47:54,427 - INFO : job:26dad25f1e072c05c7b3c041933108bd8515ec3c job_id:803970 after re-submitting; the state now is <JobStatus.running: 3>
2024-11-27 12:47:54,428 - INFO : job: bc817200d66825111065f773a821b7170a88bf87 803914 terminated;fail_cout is 2; resubmitting job
2024-11-27 12:47:54,447 - INFO : job:bc817200d66825111065f773a821b7170a88bf87 re-submit after terminated; new job_id is 803971
2024-11-27 12:47:54,659 - INFO : job:bc817200d66825111065f773a821b7170a88bf87 job_id:803971 after re-submitting; the state now is <JobStatus.waiting: 2>
2024-11-27 12:47:54,659 - INFO : job: a9b29add52b7c29d91b5b26b2c3aae99ff2d27d7 803915 terminated;fail_cout is 2; resubmitting job
2024-11-27 12:47:54,680 - INFO : job:a9b29add52b7c29d91b5b26b2c3aae99ff2d27d7 re-submit after terminated; new job_id is 803972
2024-11-27 12:47:54,892 - INFO : job:a9b29add52b7c29d91b5b26b2c3aae99ff2d27d7 job_id:803972 after re-submitting; the state now is <JobStatus.waiting: 2>
2024-11-27 12:47:54,892 - INFO : job: d3de077e16150b4b84e7186904477af4be382f7c 803916 terminated;fail_cout is 2; resubmitting job
2024-11-27 12:47:54,913 - INFO : job:d3de077e16150b4b84e7186904477af4be382f7c re-submit after terminated; new job_id is 803973
2024-11-27 12:47:55,125 - INFO : job:d3de077e16150b4b84e7186904477af4be382f7c job_id:803973 after re-submitting; the state now is <JobStatus.waiting: 2>
2024-11-27 12:52:55,888 - INFO : job: 26dad25f1e072c05c7b3c041933108bd8515ec3c 803970 terminated;fail_cout is 3; resubmitting job
Traceback (most recent call last):
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dpdispatcher/submission.py", line 353, in handle_unexpected_submission_state
job.handle_unexpected_job_state()
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dpdispatcher/submission.py", line 851, in handle_unexpected_job_state
raise RuntimeError(err_msg)
RuntimeError: job:26dad25f1e072c05c7b3c041933108bd8515ec3c 803970 failed 3 times. job_detail:{'26dad25f1e072c05c7b3c041933108bd8515ec3c': {'job_task_list': [{'command': 'sed -i "s#\\$(pwd)#$(pwd)#g" script2 && python3 script2', 'task_work_path': './', 'forward_files': ['script2', './/tmp/inputs/artifacts/topology', './/tmp/inputs/artifacts/conf', './/tmp/inputs/artifacts/dflow_python_packages'], 'backward_files': ['.//tmp/outputs/artifacts/task_path', './/tmp/outputs/parameters/cv_dim_2', 'log'], 'outlog': 'log', 'errlog': 'log'}], 'resources': {'number_node': 1, 'cpu_per_node': 8, 'gpu_per_node': 0, 'queue_name': 'cpu', 'group_size': 1, 'custom_flags': ['#SBATCH --time=120:00:00'], 'strategy': {'if_cuda_multi_devices': False, 'ratio_unfinished': 0.0}, 'para_deg': 1, 'module_purge': False, 'module_unload_list': [], 'module_list': [], 'source_list': ['export CONDA_PREFIX=/home/jygu/.conda/envs/rid-kit', 'conda activate /home/jygu/.conda/envs/rid-kit', 'export LIBRARY_PATH=/home/jygu/.conda/envs/rid-kit/lib:$LIBRARY_PATH', 'export LD_LIBRARY_PATH=/home/jygu/.conda/envs/rid-kit/lib:$LD_LIBRARY_PATH', 'export LD_LIBRARY_PATH=/opt/ohpc/pub/compiler/gcc/12.2.0/lib64:$LD_LIBRARY_PATH', 'export PYTHONPATH=/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages:$PYTHONPATH'], 'envs': {'DFLOW_WORKFLOW': 'ala-dipeptide-1', 'DFLOW_POD': 'iter-001-prep-exploration-0', 'PYTHONUNBUFFERED': 'true', 'ARGO_TEMPLATE': "''"}, 'prepend_script': [], 'append_script': [], 'wait_time': 0, 'kwargs': {}}, 'job_state': <JobStatus.terminated: 4>, 'job_id': '803970', 'fail_count': 3}}
Possible remote error message: ==> /home/jygu/wxy/rid_kit/rid-kit-master/ala-dipeptide-1/iter-001-prep-exploration-0/workdir/d158c24da9879e8890a921023117f9b33c496398/./log <==
d-kit/lib/python3.9/site-packages/rid/common/mol.py", line 5, in <module>
File "/home/jygu/wxy/rid_kit/rid-kit-master/ala-dipeptide-1/iter-001-prep-exploration-0/workdir/d158c24da9879e8890a921023117f9b33c496398/tmp/inputs/artifacts/dflow_python_packages/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/rid/common/mol.py", line 5, in <module>
import mdtraj as mdfrom rid.common.mol import get_distance_from_atomid,get_dihedral_value_from_resid
ModuleNotFoundError File "/home/jygu/wxy/rid_kit/rid-kit-master/ala-dipeptide-1/iter-001-prep-exploration-0/workdir/d158c24da9879e8890a921023117f9b33c496398/tmp/inputs/artifacts/dflow_python_packages/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/rid/common/mol.py", line 5, in <module>
: No module named 'mdtraj'
import mdtraj as mdimport mdtraj as md
ModuleNotFoundErrorModuleNotFoundError: : No module named 'mdtraj'No module named 'mdtraj'
import mdtraj as md
ModuleNotFoundError: No module named 'mdtraj'
�[0m
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/jygu/wxy/rid_kit/rid-kit-master/ala-dipeptide-1/iter-001-prep-exploration-0/script", line 105, in <module>
submission.run_submission(clean=True)
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dpdispatcher/submission.py", line 258, in run_submission
self.handle_unexpected_submission_state()
File "/home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages/dpdispatcher/submission.py", line 356, in handle_unexpected_submission_state
raise RuntimeError(
RuntimeError: Meet errors will handle unexpected submission state.
Debug information: remote_root==/home/jygu/wxy/rid_kit/rid-kit-master/ala-dipeptide-1/iter-001-prep-exploration-0/workdir/d158c24da9879e8890a921023117f9b33c496398.
Debug information: submission_hash==d158c24da9879e8890a921023117f9b33c496398.
Please check the dirs and scripts in remote_root. The job information mentioned above may help.
2024-11-27 12:52:55 | INFO | rid.entrypoint.main | The task is displayed on "https://127.0.0.1:2746".
2024-11-27 12:52:55 | INFO | rid.entrypoint.main | Artifacts (Files) are listed on "https://127.0.0.1:9001".
However, mdtraj appears to be installed in the rid-kit conda environment. Running the command pip show mdtraj gives the following output:
Name: mdtraj
Version: 1.9.9
Summary: MDTraj: A modern, open library for the analysis of molecular dynamics trajectories
Home-page: http://mdtraj.org
Author: Robert McGibbon
Author-email: rmcgibbo@gmail.com
License: LGPLv2.1+
Location: /home/jygu/.conda/envs/rid-kit/lib/python3.9/site-packages
Requires: astunparse, numpy, pyparsing, scipy
Required-by: rid-kit
Metadata
Metadata
Assignees
Labels
No labels