
Commit a9e5a83

Merge pull request #12 from paxtonfitzpatrick/remote_submit
solid framework for revamp, including basic functionality for submitting jobs to cluster from local machine
2 parents 675bf62 + 64b4983 commit a9e5a83

File tree

7 files changed (+280 −44 lines)

_helpers.py

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
import hashlib
import os
import sys
from os.path import realpath, join as opj, sep as pathsep
from configparser import ConfigParser


def attempt_load_config():
    """
    tries to load the config file from its expected path in cases where
    neither a filepath nor a dict-like object is provided
    """
    splitpath = realpath(__file__).split(pathsep)
    try:
        try:
            # get path to project root directory
            splitroot = splitpath[: splitpath.index('cluster-tools-dartmouth') + 1]
            project_root = pathsep.join(splitroot)
            config_dir = opj(project_root, 'configs')
        except ValueError as e:
            # pass exception on to the broader outer exception for the function
            raise FileNotFoundError(
                f"cluster-tools-dartmouth not found in path {realpath(__file__)}"
            ).with_traceback(e.__traceback__)

        configs = os.listdir(config_dir)
        # filter out hidden files and the template config
        configs = [f for f in configs
                   if not (f.startswith('template') or f.startswith('.'))]
        if len(configs) == 1:
            config_path = opj(config_dir, configs[0])
            return parse_config(config_path)
        else:
            # fail if multiple or no config files are found
            raise FileNotFoundError(
                f"Unable to determine which config file to read from "
                f"{len(configs)} choices in {config_dir}"
            )

    except FileNotFoundError as e:
        raise FileNotFoundError(
            "Failed to load config file from expected location"
        ).with_traceback(e.__traceback__)


def md5_checksum(filepath):
    """
    computes the MD5 checksum of a local file to compare against the remote copy

    NOTE: MD5 IS CONSIDERED CRYPTOGRAPHICALLY INSECURE
    (see https://en.wikipedia.org/wiki/MD5#Security)
    However, it's still very much suitable in cases (like ours) where one
    wouldn't expect **intentional** data corruption
    """
    hash_md5 = hashlib.md5()
    with open(filepath, 'rb') as f:
        # read in chunks to avoid loading the whole file into memory at once
        for chunk in iter(lambda: f.read(4096), b''):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def parse_config(config_path):
    """
    parses various user-specific options from a config file in the configs dir
    """
    raw_config = ConfigParser(inline_comment_prefixes='#')
    with open(config_path, 'r') as f:
        raw_config.read_file(f)

    config = dict(raw_config['CONFIG'])
    config['confirm_overwrite_on_upload'] = raw_config.getboolean(
        'CONFIG', 'confirm_overwrite_on_upload'
    )
    return config


def prompt_input(question, default=None):
    """
    given a question, prompts the user for command-line input;
    returns True for 'yes'/'y' and False for 'no'/'n' responses
    """
    assert default in ('yes', 'no', None), \
        "Default response must be either 'yes', 'no', or None"

    valid_responses = {
        'yes': True,
        'y': True,
        'no': False,
        'n': False
    }

    if default is None:
        prompt = "[y/n]"
    elif default == 'yes':
        prompt = "[Y/n]"
    else:
        prompt = "[y/N]"

    while True:
        sys.stdout.write(f"{question}\n{prompt} ")
        response = input().lower()
        # if the user hits return without typing, return the default response
        if (default is not None) and (not response):
            return valid_responses[default]
        elif response in valid_responses:
            return valid_responses[response]
        else:
            sys.stdout.write("Please respond with either 'yes' (or 'y') "
                             "or 'no' (or 'n')\n")
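
A minimal sketch of how these helpers compose (the file name and prompt text below are hypothetical):

from _helpers import md5_checksum, prompt_input

# hash a local file in 4096-byte chunks
local_hash = md5_checksum('my_script.py')
print(f'local MD5: {local_hash}')

# default='yes' renders the prompt as [Y/n], so pressing return with no
# input counts as a 'yes'
if prompt_input('Re-upload my_script.py?', default='yes'):
    print('uploading...')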

cluster_scripts/config.py

Lines changed: 17 additions & 31 deletions
@@ -1,38 +1,24 @@
-import socket
-import os
-from ..helpers import parse_config
+from os.path import dirname, realpath, join as opj

-config = dict()
+job_config = dict()


 # ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======
 # job creation options
-
-# ******** check kiewit hostname from eduroam ********
-if (socket.gethostname() == 'Paxtons-MacBook-Pro') or (socket.gethostname() == 'Paxtons-MacBook-Pro.kiewit.dartmouth.edu') or (socket.gethostname() == 'Paxtons-MacBook-Pro.local'):
-    config['datadir'] = '/Users/paxtonfitzpatrick/Documents/Dartmouth/Thesis/memory-dynamics/data/models/participants/trajectories'
-    config['workingdir'] = config['datadir']
-    config['startdir'] = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))  # directory to start the job in
-    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_local.sh')
-else:
-    config['datadir'] = os.path.join('/dartfs/rc/lab/D/DBIC/CDL/f0028ph/eventseg', 'trajectories')
-    config['workingdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/eventseg/cluster-scripts'
-    config['startdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f0028ph/eventseg/'
-    config['template'] = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'run_job_cluster.sh')
-
-config['scriptdir'] = os.path.join(config['workingdir'], 'scripts')
-config['lockdir'] = os.path.join(config['workingdir'], 'locks')
-config['resultsdir'] = os.path.join(config['workingdir'], 'results')
-
+job_config['startdir'] = # path to the folder for this project. Should be something like /dartfs/rc/lab/D/DBIC/CDL/<your_net_id>/<project_name>
+job_config['datadir'] = opj(job_config['startdir'], 'data')
+job_config['workingdir'] = opj(job_config['startdir'], 'scripts')
+job_config['template'] = opj(dirname(realpath(__file__)), 'run_job_cluster.sh')
+job_config['scriptdir'] = opj(job_config['workingdir'], 'scripts')
+job_config['lockdir'] = opj(job_config['workingdir'], 'locks')


 # runtime options
-config['jobname'] = # (str) default job name
-config['q'] = # (str) options: default, test, largeq
-config['nnodes'] = # (int) how many nodes to use for this one job
-config['ppn'] = # (int) how many processors to use for this one job (assume 4GB of RAM per processor)
-config['walltime'] = # (str) maximum runtime, in h:MM:SS
-config['cmd_wrapper'] = # (str) replace with actual command wrapper (e.g. matlab, python, etc.)
-config['modules']
-
-#extra options
-
+job_config['jobname'] = # (str) default job name
+job_config['q'] = # (str) options: default, test, largeq (when in doubt, use "largeq")
+job_config['nnodes'] = # (int) how many nodes to use for this one job
+job_config['ppn'] = # (int) how many processors to use for this one job (assume 4GB of RAM per processor)
+job_config['walltime'] = # (str) maximum runtime, in h:MM:SS (e.g., "10:00:00")
+job_config['cmd_wrapper'] = # (str) replace with the actual command wrapper (e.g. "python", "matlab", etc.)
+job_config['modules'] = # (str) modules you need to load for your scripts, separated by spaces (e.g., "python matlab")
+job_config['env_type'] = # (str) what kind of Python environment you use (NOTE: the sole option is currently conda -- venv and virtualenv coming soon!)
+job_config['env_name'] = # (str) name of the (currently, conda) environment you want your submission script and jobs to run in
 # ====== MODIFY ONLY THE CODE BETWEEN THESE LINES ======
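
Filled in, the user-editable section might look like the following sketch (every value here is hypothetical):

job_config['startdir'] = '/dartfs/rc/lab/D/DBIC/CDL/f00xxxx/my_project'
job_config['jobname'] = 'my_analysis'
job_config['q'] = 'largeq'
job_config['nnodes'] = 1
job_config['ppn'] = 4  # roughly 16GB of RAM at 4GB per processor
job_config['walltime'] = '10:00:00'
job_config['cmd_wrapper'] = 'python'
job_config['modules'] = 'python'
job_config['env_type'] = 'conda'
job_config['env_name'] = 'my_env'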

configs/template_config.txt

Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,10 @@
 # DO NOT EDIT THIS FILE
 # You don't want to push sensitive information to GitHub by mistake!
-# Duplicate this file, rename it config.txt and fill in each field with your personal options
+# Duplicate this file, rename it as you see fit, and fill in each field with your personal options

 [CONFIG]
 hostname = # name of host where you plan to run jobs
 username = # your username for logging into your cluster account
-password = # your password (remember make sure you don't push this to GitHub!)
+password = # your password (NOTE: make sure you don't push this to GitHub!)
 submit_command = # command used to submit jobs (mksub if your username starts with f00, otherwise qsub)
-confirm_overwrite_on_upload = true
+confirm_overwrite_on_upload = # (true or false) whether or not you want to be prompted before overwriting existing remote files with local changes
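
Once duplicated and filled in, a config might look like this sketch (every value below is made up):

[CONFIG]
hostname = discovery.example.edu   # hypothetical cluster address
username = f00xxxx
password = my-secret-password
submit_command = mksub             # username above starts with f00
confirm_overwrite_on_upload = true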

helpers.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

remote_submit.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
from os.path import dirname, realpath, join as opj
from spurplus import connect_with_retries
from .upload_scripts import upload_scripts
from ._helpers import attempt_load_config, parse_config
from .cluster_scripts.config import job_config


def remote_submit(config_path=None, sync_changes=False, await_output=False):
    """
    main function that handles submitting jobs on the cluster from your local
    machine

    :param config_path: (str, optional, default: None) path to your config
      file. If you created your config following the instructions in
      configs/template_config.txt, you can simply leave this empty
    :param sync_changes: (bool, default: False) if True, upload any local
      changes to cluster scripts before submitting jobs
    :param await_output: (bool, default: False) if True, keep the connection
      with the remote open until your submit script is finished creating jobs.
      Otherwise, terminate the connection after calling the submit script and
      allow job submission to happen in the background.
      WARNING: this can be a rather lengthy process depending on the number of
      jobs you're running. Setting this to True opens you up to the
      possibility that the SSH connection may fail before job submission is
      finished
    :return: None (other than, hopefully, some results eventually!)
    """
    if config_path is None:
        config = attempt_load_config()
    else:
        config = parse_config(config_path)

    hostname = config['hostname']
    username = config['username']
    password = config['password']
    confirm_overwrite = config['confirm_overwrite_on_upload']

    modules = job_config['modules']
    env_type = job_config['env_type']
    env_name = job_config['env_name']
    submit_cmd_wrapper = job_config['cmd_wrapper']
    # TODO: ability to handle custom-named submission script
    submit_script_path = opj(job_config['workingdir'], 'submit.py')

    # pre-submission commands to be concatenated and run together in remote shell
    remote_cmds = ['sh', '-c']
    # command for loading module(s)
    module_load_cmd = f'module load {modules}'
    # command for activating virtual environment
    if env_type == 'conda':
        activate_cmd = 'source activate'
    else:
        # TODO: add commands for venv & virtualenv activation
        raise ValueError("Only conda environments are currently supported")
    env_activate_cmd = f'{activate_cmd} {env_name}'
    # command for calling submit script
    submit_cmd = f'{submit_cmd_wrapper} {submit_script_path}'

    full_submission_cmd = ' && '.join([
        module_load_cmd,
        env_activate_cmd,
        submit_cmd
    ])

    remote_cmds.append(full_submission_cmd)

    with connect_with_retries(
            hostname=hostname,
            username=username,
            password=password
    ) as cluster:
        if sync_changes:
            script_dir = opj(dirname(realpath(__file__)), 'cluster_scripts')
            upload_scripts(
                cluster,
                script_dir,
                job_config,
                confirm_overwrite=confirm_overwrite
            )

        if await_output:
            output = cluster.check_output(remote_cmds)
            print(output)
        else:
            cluster.run(remote_cmds)
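
With a filled-in config and job_config, kicking off jobs from the local machine might look like this minimal sketch (the package and config path are hypothetical):

# assumes the project directory is importable as a package named
# cluster_tools (hypothetical name)
from cluster_tools.remote_submit import remote_submit

# upload any locally edited cluster scripts first, then call the remote
# submit script and let job creation continue in the background
remote_submit(config_path='configs/my_config.txt', sync_changes=True)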

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
spurplus
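
The lone third-party dependency can be installed from the project root with the standard command:

pip install -r requirements.txt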

upload_scripts.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
import os
from os.path import dirname, realpath, join as opj
from spurplus import connect_with_retries
from ._helpers import md5_checksum, attempt_load_config, prompt_input
from .cluster_scripts.config import job_config


def upload_scripts(remote_shell, local_script_dir, job_conf, confirm_overwrite=True):
    remote_startdir = job_conf['startdir']
    remote_workingdir = job_conf['workingdir']
    remote_datadir = job_conf['datadir']

    to_upload = os.listdir(local_script_dir)
    # ignore hidden files (e.g., .DS_Store on macOS)
    to_upload = [f for f in to_upload if not f.startswith('.')]
    for remote_dir in [remote_startdir, remote_workingdir, remote_datadir]:
        try:
            remote_shell.is_dir(remote_dir)
        except FileNotFoundError:
            # the is_dir method raises an exception if the path doesn't exist
            print(f'creating remote directory: {remote_dir}')
            remote_shell.mkdir(remote_dir)

    print("uploading scripts...")
    for file in to_upload:
        src_path = opj(local_script_dir, file)
        dest_path = opj(remote_workingdir, file)
        if remote_shell.exists(dest_path):
            # don't bother uploading the file if it hasn't been edited
            local_checksum = md5_checksum(src_path)
            remote_checksum = remote_shell.md5(dest_path)
            if local_checksum == remote_checksum:
                print(f"skipping {file} (no changes)")
                continue

            if confirm_overwrite:
                # prompt for confirmation of overwrite if the option is enabled
                question = f"{file}: overwrite remote version with local changes?"
                overwrite_confirmed = prompt_input(question)
                if not overwrite_confirmed:
                    print(f"skipping {file} (overwrite declined)")
                    continue

        remote_shell.put(src_path, dest_path, create_directories=False)
        print(f"uploaded {file}")
    print("finished uploading scripts")


# setup for running as a stand-alone script
if __name__ == '__main__':
    config = attempt_load_config()
    hostname = config['hostname']
    username = config['username']
    password = config['password']
    confirm_overwrite = config['confirm_overwrite_on_upload']

    script_dir = opj(dirname(realpath(__file__)), 'cluster_scripts')

    with connect_with_retries(
            hostname=hostname,
            username=username,
            password=password
    ) as cluster:
        upload_scripts(cluster, script_dir, job_config, confirm_overwrite=confirm_overwrite)
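
Against a remote working directory that already holds an earlier upload, a session might look something like this (file names and responses are hypothetical):

uploading scripts...
skipping submit.py (no changes)
config.py: overwrite remote version with local changes?
[y/n] y
uploaded config.py
finished uploading scripts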
