From e36bf43f17df70b185dbf834e6616def249dbae2 Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Thu, 9 May 2024 14:58:41 -0400 Subject: [PATCH 1/5] remove cobalt scheduler implementation from buildtest source code. --- buildtest/builders/base.py | 12 +- buildtest/config.py | 47 +------ buildtest/executors/cobalt.py | 207 ------------------------------- buildtest/executors/setup.py | 7 +- buildtest/scheduler/cobalt.py | 128 ------------------- buildtest/scheduler/detection.py | 26 ---- 6 files changed, 5 insertions(+), 422 deletions(-) delete mode 100644 buildtest/executors/cobalt.py delete mode 100644 buildtest/scheduler/cobalt.py diff --git a/buildtest/builders/base.py b/buildtest/builders/base.py index f17b875ae..fbfd4e855 100644 --- a/buildtest/builders/base.py +++ b/buildtest/builders/base.py @@ -144,7 +144,7 @@ def __init__( self.logger.debug(f"Processing Buildspec File: {self.buildspec}") self.logger.debug(f"Processing Test: {self.name}") - # get type attribute from Executor class (local, slurm, cobalt, pbs, lsf) + # get type attribute from Executor class (local, slurm, pbs, lsf) self.executor_type = buildexecutor.executors[self.executor].type self.buildexecutor = buildexecutor @@ -748,7 +748,7 @@ def _default_test_variables(self): return lines def sched_init(self): - """This method will resolve scheduler fields: 'sbatch', 'pbs', 'bsub', 'cobalt'""" + """This method will resolve scheduler fields: 'sbatch', 'pbs', 'bsub'""" self.sbatch = deep_get( self.recipe, "executors", self.executor, "sbatch" ) or self.recipe.get("sbatch") @@ -758,9 +758,6 @@ def sched_init(self): self.pbs = deep_get( self.recipe, "executors", self.executor, "pbs" ) or self.recipe.get("pbs") - self.cobalt = deep_get( - self.recipe, "executors", self.executor, "cobalt" - ) or self.recipe.get("cobalt") self.burstbuffer = self.recipe.get("BB") or deep_get( self.recipe, "executors", self.executor, "BB" @@ -796,11 +793,6 @@ def get_job_directives(self): lines.append(f"#PBS -o 
{self.name}.o") lines.append(f"#PBS -e {self.name}.e") - if self.cobalt: - for line in self.cobalt: - lines.append(f"#COBALT {line}") - lines.append(f"#COBALT --jobname={self.name}") - return lines def _get_burst_buffer(self, burstbuffer): diff --git a/buildtest/config.py b/buildtest/config.py index 72544f2c4..da4441d8a 100644 --- a/buildtest/config.py +++ b/buildtest/config.py @@ -9,7 +9,7 @@ console, ) from buildtest.exceptions import BuildTestError, ConfigurationError -from buildtest.scheduler.detection import LSF, PBS, Cobalt, Slurm, Torque +from buildtest.scheduler.detection import LSF, PBS, Slurm, Torque from buildtest.schemas.defaults import custom_validator from buildtest.schemas.utils import load_recipe, load_schema from buildtest.utils.file import resolve_path @@ -46,7 +46,6 @@ def __init__(self, settings_file=None, verbose=None): "slurm": {}, "lsf": {}, "pbs": {}, - "cobalt": {}, "torque": {}, "container": {}, } @@ -172,7 +171,6 @@ def _executor_check(self): self._validate_local_executors() self._validate_slurm_executors() self._validate_lsf_executors() - self._validate_cobalt_executors() self._validate_pbs_executors() self._validate_torque_executors() self._validate_container_executors() @@ -331,49 +329,6 @@ def _validate_slurm_executors(self): "setting": slurm_executor[executor] } - def _validate_cobalt_executors(self): - """Validate cobalt queue property by running ```qstat -Ql ``. If - its a non-zero exit code then queue doesn't exist otherwise it is a valid - queue. 
- """ - - cobalt_executor = deep_get(self.target_config, "executors", "cobalt") - if not cobalt_executor: - - if self.verbose: - console.print( - "No Cobalt executors found in configuration file", style="bold blue" - ) - - return - - executor_type = "cobalt" - - cobalt = Cobalt(custom_dirs=deep_get(self.target_config, "paths", "cobalt")) - if not cobalt.active(): - return - - queue_info = cobalt.queues() - - for executor in cobalt_executor: - executor_name = f"{self.name()}.{executor_type}.{executor}" - - if self.is_executor_disabled(cobalt_executor[executor]): - self.disabled_executors.append(executor_name) - continue - - queue = cobalt_executor[executor].get("queue") - # if queue property defined in cobalt executor name check if it exists - if queue not in queue_info: - logger.error( - f"Cobalt queue '{queue}' does not exist. Available Cobalt queues: {queue_info} " - ) - continue - - self.valid_executors[executor_type][executor_name] = { - "setting": cobalt_executor[executor] - } - def _validate_pbs_executors(self): """Validate pbs queue property by running by checking if queue is found and queue is 'enabled' and 'started' which are two properties found in pbs queue diff --git a/buildtest/executors/cobalt.py b/buildtest/executors/cobalt.py deleted file mode 100644 index 7d5904cb2..000000000 --- a/buildtest/executors/cobalt.py +++ /dev/null @@ -1,207 +0,0 @@ -"""This method implements CobaltExecutor class which is defines how cobalt executor -submit job to Cobalt scheduler.""" - -import json -import logging -import os -import re -import shutil -import time - -from buildtest.defaults import console -from buildtest.executors.base import BaseExecutor -from buildtest.scheduler.cobalt import CobaltJob -from buildtest.utils.file import is_file, read_file -from buildtest.utils.tools import check_binaries, deep_get - -logger = logging.getLogger(__name__) - - -class CobaltExecutor(BaseExecutor): - """The CobaltExecutor class is responsible for submitting jobs to Cobalt 
Scheduler. - The class implements the following methods: - - - **load**: load Cobalt executors from configuration file - - **dispatch**: submit Cobalt job to scheduler - - **poll**: poll Cobalt job via qstat and retrieve job state - - **gather**: gather job record including output, error, exit code - """ - - type = "cobalt" - - def __init__( - self, name, settings, site_configs, account=None, maxpendtime=None, timeout=None - ): - self.account = account - self.maxpendtime = maxpendtime - super().__init__(name, settings, site_configs, timeout=timeout) - - self.queue = self._settings.get("queue") - self.custom_dirs = deep_get(site_configs.target_config, "paths", "cobalt") - - def launcher_command(self, numprocs, numnodes): - self.cobalt_cmds = check_binaries( - ["qsub", "qstat", "qdel"], custom_dirs=self.custom_dirs - ) - batch_cmd = [self.cobalt_cmds["qsub"]] - - if self.queue: - batch_cmd += [f"-q {self.queue}"] - - if self.account: - batch_cmd += [f"--project {self.account}"] - - if numprocs: - batch_cmd += [f"--proccount={self.numprocs}"] - - if numnodes: - batch_cmd += [f"--nodecount={self.numnodes}"] - - if self.launcher_opts: - batch_cmd += [" ".join(self.launcher_opts)] - - return batch_cmd - - def run(self, builder): - """This method is responsible for dispatching job to Cobalt Scheduler by invoking ``builder.run()`` - which runs the build script. If job is submitted to scheduler, we get the JobID and pass this to - ``CobaltJob`` class. At job submission, cobalt will report the output and error file which can be retrieved - using **qstat**. We retrieve the cobalt job record using ``builder.job.gather()``. 
- - Args: - builder (buildtest.buildsystem.base.BuilderBase): An instance object of BuilderBase type - """ - - os.chdir(builder.stage_dir) - - cmd = f"{self.shell} {os.path.basename(builder.build_script)}" - - timeout = self.timeout or self._buildtestsettings.target_config.get("timeout") - - command = builder.run(cmd, timeout) - - if command.returncode() != 0: - builder.failed() - return builder - - out = command.get_output() - out = " ".join(out) - - # convert JobID into integer - job_id = int(out) - builder.metadata["jobid"] = job_id - - builder.job = CobaltJob(job_id, self.cobalt_cmds) - - msg = f"[blue]{builder}[/]: JobID: {builder.metadata['jobid']} dispatched to scheduler" - console.print(msg) - logger.debug(msg) - - # output and error file in format .output and .error we set full path to file. By - # default Cobalt will write file into current directory where job is submitted. We assume output and error - # file names are not set in job script - - builder.metadata["outfile"] = os.path.join( - builder.stage_dir, builder.job.output_file() - ) - builder.metadata["errfile"] = os.path.join( - builder.stage_dir, builder.job.error_file() - ) - - logger.debug(f"Output file will be written to: {builder.metadata['outfile']}") - logger.debug(f"Error file will be written to: {builder.metadata['errfile']}") - - # gather job record - builder.job.retrieve_jobdata() - builder.metadata["job"] = builder.job.jobdata() - logger.debug(json.dumps(builder.metadata["job"], indent=2)) - - return builder - - def poll(self, builder): - """This method is responsible for polling Cobalt job by invoking the builder method - ``builder.job.poll()``. We check the job state and existence of output file. If file - exists or job is complete, we gather the results and return from function. If job - is pending we check if job time exceeds ``maxpendtime`` time limit and cancel job. 
- - Args: - builder (buildtest.buildsystem.base.BuilderBase): An instance object of BuilderBase type - """ - - builder.job.poll() - # Cobalt job can disappear if job is complete so we check when outputfile exists as an indicator when job is finished - if is_file(builder.metadata["outfile"]) or builder.job.is_complete(): - # builder.job_state = "exiting" - self.gather(builder) - return - - builder.stop() - - if builder.job.is_running(): - builder.job.elapsedtime = time.time() - builder.job.starttime - builder.job.elapsedtime = round(builder.job.elapsedtime, 2) - if self._cancel_job_if_elapsedtime_exceeds_timeout(builder): - return - - if builder.job.is_suspended() or builder.job.is_pending(): - if self._cancel_job_if_pendtime_exceeds_maxpendtime(builder): - return - builder.start() - - def gather(self, builder): - """This method is responsible for moving output and error file in the run - directory. We need to read ``.cobaltlog`` file which contains - output of exit code by performing a regular expression. - The cobalt log file will contain a line: **task completed normally with an exit code of 0; initiating job cleanup and removal** - - Args: - builder (buildtest.buildsystem.base.BuilderBase): An instance object of BuilderBase type - """ - - builder.record_endtime() - # The cobalt job will write output and error file after job completes, there is a few second delay before file comes. 
Hence - # stay in while loop and sleep for every 5 second until we find both files in filesystem - while True: - interval = 5 - if is_file(builder.metadata["outfile"]) and is_file( - builder.metadata["errfile"] - ): - break - logger.debug( - f"Sleeping {interval} seconds and waiting for Cobalt Scheduler to write output and error file" - ) - time.sleep(interval) - - # builder.metadata["output"] = read_file(builder.metadata["outfile"]) - # builder.metadata["error"] = read_file(builder.metadata["errfile"]) - - cobaltlog = os.path.join(builder.stage_dir, builder.job.cobalt_log()) - - logger.debug(f"Cobalt Log File written to {cobaltlog}") - - # if os.path.exists(cobaltlog): - content = read_file(cobaltlog) - pattern = r"(exit code of.)(\d+)(\;)" - # pattern to check in cobalt log file is 'exit code of ;' - m = re.search(pattern, content) - if m: - rc = int(m.group(2)) - builder.metadata["result"]["returncode"] = rc - logger.debug( - f"Test: {builder.name} got returncode: {rc} from JobID: {builder.job.jobid}" - ) - else: - logger.debug( - f"Error in regular expression: '{pattern}'. Unable to find returncode please check your cobalt log file" - ) - - shutil.copy2( - cobaltlog, os.path.join(builder.test_root, os.path.basename(cobaltlog)) - ) - logger.debug( - f"Copying cobalt log file: {cobaltlog} to {os.path.join(builder.test_root,os.path.basename(cobaltlog))}" - ) - - console.print(f"[blue]{builder}[/]: Job {builder.job.get()} is complete! ") - - builder.post_run_steps() diff --git a/buildtest/executors/setup.py b/buildtest/executors/setup.py index 54283fdce..f4607300d 100644 --- a/buildtest/executors/setup.py +++ b/buildtest/executors/setup.py @@ -1,8 +1,7 @@ """ This module is responsible for setup of executors defined in buildtest configuration. The BuildExecutor class initializes the executors and chooses the -executor class (LocalExecutor, LSFExecutor, SlurmExecutor, CobaltExecutor) to call depending -on executor name. 
+executor class to call depending on executor name. """ import logging @@ -18,7 +17,6 @@ from buildtest.defaults import BUILDTEST_EXECUTOR_DIR, console from buildtest.exceptions import BuildTestError, ExecutorError from buildtest.executors.base import BaseExecutor -from buildtest.executors.cobalt import CobaltExecutor from buildtest.executors.container import ContainerExecutor from buildtest.executors.local import LocalExecutor from buildtest.executors.lsf import LSFExecutor @@ -94,7 +92,6 @@ def __init__( "lsf": LSFExecutor, "pbs": PBSExecutor, "torque": TorqueExecutor, - "cobalt": CobaltExecutor, "container": ContainerExecutor, } @@ -397,7 +394,7 @@ def poll(self, pending_jobs): # for every pending job poll job and mark if job is finished or cancelled for job in jobs: - # get executor instance for corresponding builder. This would be one of the following: SlurmExecutor, PBSExecutor, LSFExecutor, CobaltExecutor + # get executor instance for corresponding builder. This would be one of the following: SlurmExecutor, PBSExecutor, LSFExecutor executor = self.get(job.executor) executor.poll(job) diff --git a/buildtest/scheduler/cobalt.py b/buildtest/scheduler/cobalt.py deleted file mode 100644 index 55015aeea..000000000 --- a/buildtest/scheduler/cobalt.py +++ /dev/null @@ -1,128 +0,0 @@ -import logging -import time - -from buildtest.scheduler.job import Job -from buildtest.utils.command import BuildTestCommand - -logger = logging.getLogger(__name__) - - -class CobaltJob(Job): - """The ``CobaltJob`` class performs operation on cobalt job upon job submission such - as polling job, gather job record, cancel job. We also retrieve job state and determine if job - is pending, running, complete, suspended. 
- """ - - def __init__(self, jobID, cobalt_cmds): - super().__init__(jobID) - self._outfile = str(jobID) + ".output" - self._errfile = str(jobID) + ".error" - self._cobaltlog = str(jobID) + ".cobaltlog" - self.cobalt_cmds = cobalt_cmds - - def is_pending(self): - """Return ``True`` if job is pending otherwise returns ``False``. When cobalt recieves job it is - in ``starting`` followed by ``queued`` state. We check if job is in either state. - """ - - return self._state in ["queued", "starting"] - - def is_running(self): - """Return ``True`` if job is running otherwise returns ``False``. Cobalt job state for running job is - is marked as ``running``""" - - return self._state == "running" - - def is_complete(self): - """Return ``True`` if job is complete otherwise returns ``False``. Cobalt job state for completed job - job is marked as ``exiting``""" - - return self._state == "exiting" - - def is_suspended(self): - """Return ``True`` if job is suspended otherwise returns ``False``. Cobalt job state for suspended is - marked as ``user_hold``""" - - return self._state == "user_hold" - - def is_cancelled(self): - """Return ``True`` if job is cancelled otherwise returns ``False``. 
Job state is ``cancelled`` which - is set by class ``cancel`` method - """ - - return self._state == "cancelled" - - def cobalt_log(self): - """Return job cobalt.log file""" - - return self._cobaltlog - - def poll(self): - """Poll job by running ``qstat -l --header State `` which retrieves job state.""" - - # get Job State by running 'qstat -l --header ' - query = f"{self.cobalt_cmds['qstat']} -l --header State {self.jobid}" - logger.debug(f"Getting Job State for '{self.jobid}' by running: '{query}'") - cmd = BuildTestCommand(query) - cmd.execute() - output = cmd.get_output() - - output = " ".join(output).strip() - - # Output in format State: so we need to get value of state - job_state = output.partition(":")[2].strip() - - if job_state: - self._state = job_state - - logger.debug(f"Job ID: '{self.jobid}' Job State: {self._state}") - - if self.is_running() and not self.starttime: - self.starttime = time.time() - - def retrieve_jobdata(self): - """Gather Job state by running **qstat -lf ** which retrieves all fields. - The output is in text format which is parsed into key/value pair and stored in a dictionary. This method will - return a dict containing the job record - - .. code-block:: console - - $ qstat -lf 347106 - JobID: 347106 - JobName : hold_job - User : shahzebsiddiqui - WallTime : 00:10:00 - QueuedTime : 00:13:14 - RunTime : N/A - TimeRemaining : N/A - - """ - - # 'qstat -lf ' will get all fields of Job. - qstat_cmd = f"{self.cobalt_cmds['qstat']} -lf {self.jobid}" - logger.debug(f"Executing command: {qstat_cmd}") - cmd = BuildTestCommand(qstat_cmd) - cmd.execute() - output = cmd.get_output() - - job_record = {} - # The output if in format KEY: VALUE so we store all records in a dictionary - for line in output: - key, sep, value = line.partition(":") - key = key.strip() - value = value.strip() - job_record[key] = value - - self._jobdata = job_record - - def cancel(self): - """Cancel job by running ``qdel ``. 
This method is called if job timer exceeds - ``maxpendtime`` if job is pending. - """ - - query = f"{self.cobalt_cmds['qdel']} {self.jobid}" - logger.debug(f"Cancelling job {self.jobid} by running: {query}") - cmd = BuildTestCommand(query) - cmd.execute() - - self._state = "cancelled" diff --git a/buildtest/scheduler/detection.py b/buildtest/scheduler/detection.py index 1fa20cc84..c46ebdbc3 100644 --- a/buildtest/scheduler/detection.py +++ b/buildtest/scheduler/detection.py @@ -286,32 +286,6 @@ def validate_queue(self, executor): return True -class Cobalt(Scheduler): - """The Cobalt class checks for Cobalt binaries and gets a list of Cobalt queues""" - - # specify a set of Cobalt commands to check for file existence - binaries = ["qsub", "qstat", "qdel", "nodelist", "showres", "partlist"] - - def get_queues(self): - """Get all Cobalt queues by running ``qstat -Ql`` and parsing output""" - - query = f"{self.sched_cmds['qstat']} -Ql" - cmd = BuildTestCommand(query) - cmd.execute() - content = cmd.get_output() - - self.logger.debug(f"Get all Cobalt Queues by running {query}") - # remove all None from list - content = list(filter(None, content)) - - queues = [] - for line in content: - if line.startswith("Name"): - name = line.partition(":")[2].strip() - queues.append(name) - return queues - - class PBS(Scheduler): """The PBS class checks for PBS binaries and gets a list of available queues""" From 48b81a694ff0963ccd95bcc2a6ca021c46014fea Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Thu, 9 May 2024 15:47:05 -0400 Subject: [PATCH 2/5] remove all references of cobalt in buildspec examples, configuration file examples and example tests. 
remove reference of 'cobalt' from json schema --- .../valid/job_submission.yml | 8 --- .../invalid/invalid_executor_names.yml | 5 -- .../valid/cobalt-example.yml | 40 --------------- .../valid/combined_executor.yml | 5 -- buildtest/schemas/script.schema.json | 3 -- buildtest/schemas/settings.schema.json | 48 +----------------- buildtest/schemas/spack.schema.json | 1 - buildtest/settings/config.yml | 4 -- general_tests/sched/cobalt/commands.yml | 50 ------------------- .../valid_builds/sched_directives.yml | 7 --- tests/examples/jlse/hold_job.yml | 8 --- tests/examples/jlse/hostname.yml | 8 --- tests/settings/jlse.yml | 46 ----------------- tests/test_jlse.py | 32 ------------ 14 files changed, 1 insertion(+), 264 deletions(-) delete mode 100644 buildtest/schemas/examples/settings.schema.json/valid/cobalt-example.yml delete mode 100644 general_tests/sched/cobalt/commands.yml delete mode 100644 tests/examples/jlse/hold_job.yml delete mode 100644 tests/examples/jlse/hostname.yml delete mode 100644 tests/settings/jlse.yml delete mode 100644 tests/test_jlse.py diff --git a/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml b/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml index 32c577790..fbf2aff12 100644 --- a/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml +++ b/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml @@ -19,11 +19,3 @@ buildspecs: - "-N 1" run: hostname - cobalt_example: - type: script - executor: generic.local.bash - description: This test runs hostname using cobalt directives - cobalt: - - "-t 30" - - "-n 1" - run: hostname diff --git a/buildtest/schemas/examples/settings.schema.json/invalid/invalid_executor_names.yml b/buildtest/schemas/examples/settings.schema.json/invalid/invalid_executor_names.yml index 47085a8d1..ab934fccc 100644 --- a/buildtest/schemas/examples/settings.schema.json/invalid/invalid_executor_names.yml +++ 
b/buildtest/schemas/examples/settings.schema.json/invalid/invalid_executor_names.yml @@ -18,11 +18,6 @@ system: launcher: bsub queue: batch options: ["-q batch", "-t 00:10"] - cobalt: - : - launcher: qsub - queue: normal - options: ["-n 1", "-t 10"] compilers: compiler: gcc: diff --git a/buildtest/schemas/examples/settings.schema.json/valid/cobalt-example.yml b/buildtest/schemas/examples/settings.schema.json/valid/cobalt-example.yml deleted file mode 100644 index c7da4cc79..000000000 --- a/buildtest/schemas/examples/settings.schema.json/valid/cobalt-example.yml +++ /dev/null @@ -1,40 +0,0 @@ -system: - generic: - hostnames: ['.*'] - moduletool: lmod - poolsize: 1 - buildspecs: - # whether to rebuild cache file automatically when running `buildtest buildspec find` - rebuild: False - # limit number of records to display when running `buildtest buildspec find` - count: 15 - # format fields to display when running `buildtest buildspec find`, By default we will show name,description - format: "name,description" - # enable terse mode - terse: False - report: - count: 25 - #enable terse mode for report - terse: False - format: "name,id,state,runtime,returncode" - paths: - cobalt: /usr/bin - executors: - defaults: - maxpendtime: 30 - local: - bash: - description: submit jobs via bash shell - shell: bash - cobalt: - knl: - queue: knl - haswell: - queue: haswell - compilers: - compiler: - gcc: - default: - cc: /usr/bin/gcc - cxx: /usr/bin/g++ - fc: /usr/bin/gfortran diff --git a/buildtest/schemas/examples/settings.schema.json/valid/combined_executor.yml b/buildtest/schemas/examples/settings.schema.json/valid/combined_executor.yml index 628c3234b..7eca33bb8 100644 --- a/buildtest/schemas/examples/settings.schema.json/valid/combined_executor.yml +++ b/buildtest/schemas/examples/settings.schema.json/valid/combined_executor.yml @@ -18,7 +18,6 @@ system: terse: False format: "name,id,state,runtime,returncode" paths: - cobalt: /usr/bin pbs: /usr/bin torque: /usr/bin lsf: /usr/bin 
@@ -36,10 +35,6 @@ system: batch: queue: batch options: ["-q batch", "-t 00:10"] - cobalt: - normal: - queue: normal - options: ["-n 1", "-t 10"] pbs: dev: queue: development diff --git a/buildtest/schemas/script.schema.json b/buildtest/schemas/script.schema.json index aba7c06c6..e01453a26 100644 --- a/buildtest/schemas/script.schema.json +++ b/buildtest/schemas/script.schema.json @@ -28,9 +28,6 @@ "bsub": { "$ref": "definitions.schema.json#/definitions/bsub" }, - "cobalt": { - "$ref": "definitions.schema.json#/definitions/cobalt" - }, "pbs": { "$ref": "definitions.schema.json#/definitions/pbs" }, diff --git a/buildtest/schemas/settings.schema.json b/buildtest/schemas/settings.schema.json index 5c46ef25f..4fcc9f6ad 100644 --- a/buildtest/schemas/settings.schema.json +++ b/buildtest/schemas/settings.schema.json @@ -280,7 +280,7 @@ "executors": { "type": "object", "additionalProperties": false, - "description": "The executor section is used for declaring your executors that are responsible for running jobs. The executor section can be ``local``, ``lsf``, ``slurm``, ``cobalt``. The executors are referenced in buildspec using ``executor`` field.", + "description": "The executor section is used for declaring your executors that are responsible for running jobs. The executor section can be ``local``, ``lsf``, ``slurm``. 
The executors are referenced in buildspec using ``executor`` field.", "required": ["local"], "properties": { "defaults": { @@ -333,16 +333,6 @@ "$ref": "#/definitions/slurm" } }, - "cobalt": { - "type": "object", - "description": "The ``cobalt`` section is used for declaring Cobalt executors for running jobs using Cobalt scheduler", - "propertyNames": { - "pattern": "^[A-Za-z0-9_.-]+$" - }, - "additionalProperties": { - "$ref": "#/definitions/cobalt" - } - }, "pbs": { "type": "object", "description": "The ``pbs`` section is used for declaring PBS executors for running jobs using PBS scheduler", @@ -408,10 +398,6 @@ "type": "string", "description": "Specify path to lsf executable" }, - "cobalt": { - "type": "string", - "description": "Specify path to cobalt executable" - }, "pbs": { "type": "string", "description": "Specify path to pbs executable" @@ -597,38 +583,6 @@ "module": { "$ref": "#/definitions/module" } } }, - "cobalt": { - "type": "object", - "description": "An instance object of cobalt executor", - "additionalProperties": false, - "required": [ "queue" ], - "properties": { - "description": { "$ref": "#/definitions/description" }, - "options": { - "type": "array", - "items": { "type": "string" }, - "description": "Specify any options for ``qsub`` for this executor when running all jobs associated to this executor" - }, - "queue": { - "type": "string", - "description": "Specify the lsf queue you want to use ``-q ``" - }, - "before_script": { - "description": "The ``before_script`` section can be used to specify commands before start of test. 
The script will be sourced in active shell.", - "#ref": "#/definitions/script" - }, - "maxpendtime": { - "description": "overrides default ``maxpendtime`` value", - "$ref": "#/definitions/maxpendtime" - }, - "account": { - "description": "overrides default ``account`` value", - "$ref": "#/definitions/account" - }, - "disable": {"$ref": "#/definitions/disable"}, - "module": { "$ref": "#/definitions/module" } - } - }, "pbs": { "type": "object", "description": "An instance object of pbs executor", diff --git a/buildtest/schemas/spack.schema.json b/buildtest/schemas/spack.schema.json index ee38918a2..4f3b46852 100644 --- a/buildtest/schemas/spack.schema.json +++ b/buildtest/schemas/spack.schema.json @@ -23,7 +23,6 @@ "vars": { "$ref": "definitions.schema.json#/definitions/env" }, "sbatch": { "$ref": "definitions.schema.json#/definitions/sbatch" }, "bsub": { "$ref": "definitions.schema.json#/definitions/bsub" }, - "cobalt": { "$ref": "definitions.schema.json#/definitions/cobalt" }, "pbs": { "$ref": "definitions.schema.json#/definitions/pbs" }, "skip": { "$ref": "definitions.schema.json#/definitions/skip" }, "tags": { "$ref": "definitions.schema.json#/definitions/tags" }, diff --git a/buildtest/settings/config.yml b/buildtest/settings/config.yml index 1fb75f67d..f7f30d508 100644 --- a/buildtest/settings/config.yml +++ b/buildtest/settings/config.yml @@ -59,10 +59,6 @@ system: # directory path to search for torque binaries. #torque: "/usr/bin" - # directory path to search for cobalt binaries. - #cobalt: "/usr/bin" - - # start of executor configuration executors: # local executor is used to submit jobs on local machine. In this example we have 4 executors: bash, sh, csh, zsh that will submit jobs using bash, sh, csh, zsh shell respectively. 
diff --git a/general_tests/sched/cobalt/commands.yml b/general_tests/sched/cobalt/commands.yml deleted file mode 100644 index fe7b0379d..000000000 --- a/general_tests/sched/cobalt/commands.yml +++ /dev/null @@ -1,50 +0,0 @@ - -buildspecs: - qsub_version: - type: script - executor: generic.local.sh - tags: [cobalt] - description: print version for qsub command - run: qsub --version - - qselect_version: - type: script - executor: generic.local.sh - tags: [cobalt] - description: print version for qselect - run: qselect --version - - cqsub_version: - type: script - executor: generic.local.sh - description: print version for cqsub command - tags: [cobalt] - run: cqsub --version - - qdel_version: - type: script - executor: generic.local.sh - description: print version for qdel command - tags: [cobalt] - run: qdel --version - - qmove_version: - type: script - executor: generic.local.sh - description: print version for qmove command - tags: [cobalt] - run: qmove --version - - show_jobs: - type: script - executor: generic.local.sh - description: Show all jobs in queue - tags: [cobalt] - run: qstat - - show_queues: - type: script - executor: generic.local.sh - tags: [cobalt] - description: Show all queues - run: qstat -Qf \ No newline at end of file diff --git a/tests/buildsystem/valid_builds/sched_directives.yml b/tests/buildsystem/valid_builds/sched_directives.yml index 914047949..8d1c8e59d 100644 --- a/tests/buildsystem/valid_builds/sched_directives.yml +++ b/tests/buildsystem/valid_builds/sched_directives.yml @@ -13,13 +13,6 @@ buildspecs: bsub: ["-n 1", "-W 10"] run: hostname - cobalt_hostname: - type: script - executor: generic.local.bash - description: Cobalt batch generation check - cobalt: ["-n 1", "-t 10"] - run: hostname - pbs_hostname: type: script executor: generic.local.bash diff --git a/tests/examples/jlse/hold_job.yml b/tests/examples/jlse/hold_job.yml deleted file mode 100644 index fc5c5d8f2..000000000 --- a/tests/examples/jlse/hold_job.yml +++ /dev/null @@ 
-1,8 +0,0 @@ -buildspecs: - hold_job: - executor: jlse.cobalt.iris - type: script - tags: [jobs] - description: Hold Job in queue - cobalt: ["-n 1", "-t 10", "-h"] - run: hostname diff --git a/tests/examples/jlse/hostname.yml b/tests/examples/jlse/hostname.yml deleted file mode 100644 index 77903c088..000000000 --- a/tests/examples/jlse/hostname.yml +++ /dev/null @@ -1,8 +0,0 @@ -buildspecs: - hostname_test: - executor: jlse.cobalt.iris - type: script - tags: [jobs] - description: Run hostname as batch job - cobalt: ["-n 1", "-t 10"] - run: hostname diff --git a/tests/settings/jlse.yml b/tests/settings/jlse.yml deleted file mode 100644 index ca44366cf..000000000 --- a/tests/settings/jlse.yml +++ /dev/null @@ -1,46 +0,0 @@ -system: - jlse: - # hostnames on JLSE where jobs are run are jlsebatch[1-2] - hostnames: ['^jlsebatch([1-2]).*'] - moduletool: environment-modules - poolsize: 8 - max_jobs: 10 - pager: False - buildspecs: - rebuild: False - count: 15 - format: "name,description" - terse: False - report: - count: 25 - terse: False - format: "name,id,state,runtime,returncode" - executors: - defaults: - pollinterval: 30 - maxpendtime: 15 - local: - bash: - description: submit jobs on local machine using bash shell - shell: bash - sh: - description: submit jobs on local machine using sh shell - shell: sh - csh: - description: submit jobs on local machine using csh shell - shell: csh - python: - description: submit jobs on local machine using python shell - shell: python - cobalt: - iris: - queue: iris - compilers: - find: - gcc: "^(gcc)" - compiler: - gcc: - builtin_gcc: - cc: /usr/bin/gcc - cxx: /usr/bin/g++ - fc: /usr/bin/gfortran diff --git a/tests/test_jlse.py b/tests/test_jlse.py deleted file mode 100644 index 3c94c2066..000000000 --- a/tests/test_jlse.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -import socket - -import pytest - -from buildtest.cli.build import BuildTest -from buildtest.cli.compilers import BuildtestCompilers -from buildtest.config import 
SiteConfiguration -from buildtest.utils.file import walk_tree - -hostname = socket.getfqdn() - - -def test_jlse(): - if not hostname.endswith("alcf.anl.gov"): - pytest.skip("Test runs only on JLSE Login Nodes with domain name alcf.anl.gov") - - here = os.path.dirname(os.path.abspath(__file__)) - settings_file = os.path.join(here, "settings", "jlse.yml") - - bc = SiteConfiguration(settings_file) - bc.detect_system() - bc.validate() - - buildspec_files = walk_tree(os.path.join(here, "examples", "jlse")) - - cmd = BuildTest(configuration=bc, buildspecs=buildspec_files) - cmd.build() - - # testing buildtest config compilers find - bc = BuildtestCompilers(configuration=bc) - bc.find_compilers() From 72eaeb1f8de260a75758258a9da20e69a713d14a Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Thu, 9 May 2024 15:50:12 -0400 Subject: [PATCH 3/5] remove cobalt references from documentation --- docs/batch_support.rst | 62 -------------------- docs/buildspecs/spack.rst | 2 +- docs/configuring_buildtest/overview.rst | 5 +- docs/configuring_buildtest/site_examples.rst | 15 ----- docs/contributing/regression_testing.rst | 1 - docs/writing_buildspecs/multi_executor.rst | 2 +- 6 files changed, 4 insertions(+), 83 deletions(-) diff --git a/docs/batch_support.rst b/docs/batch_support.rst index 762555b50..33ff3c11b 100644 --- a/docs/batch_support.rst +++ b/docs/batch_support.rst @@ -1437,68 +1437,6 @@ scheduler commands that are used to submit and poll the job. Adding 1 test results to report file: /home/adaptive50/buildtest/var/report.json Writing Logfile to /home/adaptive50/buildtest/var/logs/buildtest_z2vikoox.log - -Cobalt -------- - -`Cobalt `_ is a job scheduler developed -by `Argonne National Laboratory `_ that runs on compute -resources and IBM BlueGene series. Cobalt resembles `PBS `_ -in terms of command line interface such as ``qsub``, ``qacct`` however they -slightly differ in their behavior. - -Cobalt support has been tested on JLSE and Theta system. 
Cobalt directives -are specified using ``#COBALT`` this can be specified using ``cobalt`` property -which accepts a list of strings. Shown below is an example using cobalt property. - -.. code-block:: yaml - :emphasize-lines: 5 - :linenos: - - buildspecs: - yarrow_hostname: - executor: jlse.cobalt.yarrow - type: script - cobalt: ["-n 1", "--proccount 1", "-t 10"] - run: hostname - -In this example, we allocate 1 node with 1 processor for 10min. This is translated into -the following job script. - -.. code-block:: console - - #!/usr/bin/bash - #COBALT -n 1 - #COBALT --proccount 1 - #COBALT -t 10 - #COBALT --jobname yarrow_hostname - source /home/shahzebsiddiqui/buildtest/var/executors/cobalt.yarrow/before_script.sh - hostname - source /home/shahzebsiddiqui/buildtest/var/executors/cobalt.yarrow/after_script.sh - -When job starts, Cobalt will write a cobalt log file ``.cobaltlog`` which -is provided by scheduler for troubleshooting. The output and error file are generated -once job finishes. Cobalt job progresses through job state ``starting`` --> ``pending`` --> ``running`` --> ``exiting``. -buildtest will capture Cobalt job details using ``qstat -lf `` and this -is updated in the report file. - -buildtest will poll job at set interval, where we run ``qstat --header State `` to -check state of job, if job is finished then we gather results. Once job is finished, -qstat will not be able to poll job this causes an issue where buildtest can't poll -job since qstat will not return anything. This is a transient issue depending on when -you poll job, generally at ALCF qstat will not report existing job within 30sec after -job is terminated. buildtest will assume if it's able to poll job and is in `exiting` -stage that job is complete, if its unable to retrieve this state we check for -output and error file. If file exists we assume job is complete and buildtest will -gather the results. 
- -buildtest will determine exit code by parsing cobalt log file, the file contains a line -such as :: - - Thu Nov 05 17:29:30 2020 +0000 (UTC) Info: task completed normally with an exit code of 0; initiating job cleanup and removal - -qstat has no job record for capturing returncode so buildtest must rely on Cobalt Log file. - .. _max_pend_time: Jobs exceeds `max_pend_time` diff --git a/docs/buildspecs/spack.rst b/docs/buildspecs/spack.rst index a15d4ef0c..7b1a3ce1f 100644 --- a/docs/buildspecs/spack.rst +++ b/docs/buildspecs/spack.rst @@ -343,7 +343,7 @@ Specifying Scheduler Directives --------------------------------- The spack schema supports all of the :ref:`scheduler scheduler directives ` such -as ``sbatch``, ``bsub``, ``pbs``, ``cobalt``, and ``batch`` property in the buildspec. +as ``sbatch``, ``bsub`` and ``pbs`` properties in the buildspec. The directives are applied at top of script. Shown below is a toy example that will define directives using **sbatch** property. Note, this test won't submit job to scheduler diff --git a/docs/configuring_buildtest/overview.rst b/docs/configuring_buildtest/overview.rst index a0463a3be..e0d871bb7 100644 --- a/docs/configuring_buildtest/overview.rst +++ b/docs/configuring_buildtest/overview.rst @@ -157,9 +157,9 @@ Specify directory paths to search for binaries The ``paths`` property can be used to search for binaries for batch schedulers. If your scheduler binaries are installed in a non-standard location that is not in $PATH, you can use this to specify the directory path. -In example below we will, we will specify directories for SLURM, LSF, PBS, TORQUE, and COBALT binaries that +In the example below, we will specify directories for SLURM, LSF, PBS and TORQUE binaries that are not in $PATH and installed in `/usr/local/slurm/bin`, `/usr/local/lsf/bin`, -`/usr/local/pbs/bin`, `/usr/local/torque/bin`, `/usr/local/cobalt/bin` respectively. +`/usr/local/pbs/bin`, `/usr/local/torque/bin` respectively. ..
code-block:: yaml @@ -168,7 +168,6 @@ are not in $PATH and installed in `/usr/local/slurm/bin`, `/usr/local/lsf/bin`, lsf: /usr/local/lsf/bin pbs: /usr/local/pbs/bin torque: /usr/local/torque/bin - cobalt: /usr/local/cobalt/bin Buildspec Cache diff --git a/docs/configuring_buildtest/site_examples.rst b/docs/configuring_buildtest/site_examples.rst index ff18e7fe2..e80539d7a 100644 --- a/docs/configuring_buildtest/site_examples.rst +++ b/docs/configuring_buildtest/site_examples.rst @@ -35,18 +35,3 @@ customized to each site but and can be changed in the configuration file or over .. literalinclude:: ../../tests/settings/summit.yml :language: yaml :emphasize-lines: 2-5,19-23,37-43 - -Argonne National Laboratory ---------------------------- - -`Joint Laboratory for System Evaluation (JLSE) `_ provides -a testbed of emerging HPC systems, the default scheduler is Cobalt, this is -defined in the ``cobalt`` section defined in the executor field. - -We set default launcher to qsub defined with ``launcher: qsub``. This is inherited -for all batch executors. In each cobalt executor the ``queue`` property will specify -the queue name to submit job, for instance the executor ``yarrow`` with ``queue: yarrow`` -will submit job using ``qsub -q yarrow`` when using this executor. - -.. literalinclude:: ../../tests/settings/jlse.yml - :language: yaml \ No newline at end of file diff --git a/docs/contributing/regression_testing.rst b/docs/contributing/regression_testing.rst index 11d0b85c8..7e8263d30 100644 --- a/docs/contributing/regression_testing.rst +++ b/docs/contributing/regression_testing.rst @@ -197,7 +197,6 @@ and link to coverage results which can be viewed in your browser. 
In next exampl buildtest/cli/debugreport.py 18 12 2 0 30.00% buildtest/cli/compilers.py 122 83 56 3 26.97% buildtest/executors/pbs.py 125 92 18 0 25.87% - buildtest/executors/cobalt.py 149 112 24 0 23.70% buildtest/log.py 20 15 2 0 22.73% buildtest/executors/slurm.py 153 117 34 0 21.39% buildtest/executors/local.py 51 40 10 0 21.31% diff --git a/docs/writing_buildspecs/multi_executor.rst b/docs/writing_buildspecs/multi_executor.rst index b679c9b0c..668b13c98 100644 --- a/docs/writing_buildspecs/multi_executor.rst +++ b/docs/writing_buildspecs/multi_executor.rst @@ -30,7 +30,7 @@ Multiple Executors The ``executors`` property can be used to define executor specific configuration for each test, currently this field can be used with :ref:`vars `, :ref:`env ` -, scheduler directives: ``sbatch``, ``bsub``, ``pbs``, ``cobalt`` and :ref:`cray burst buffer/data warp `. +, scheduler directives: ``sbatch``, ``bsub``, ``pbs``, and :ref:`cray burst buffer/data warp `. The ``executors`` field is a JSON object that expects name of executor followed by property set per executor. In this next example, we define variables ``X``, ``Y`` and environment ``SHELL`` based on executors **generic.local.sh** and **generic.local.bash**. 
From ac75a13088ccc16453ae6ed750f11e555afe3edc Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Thu, 9 May 2024 15:50:41 -0400 Subject: [PATCH 4/5] remove jlse CI/CD pipeline --- .gitlab/jlse.yml | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 .gitlab/jlse.yml diff --git a/.gitlab/jlse.yml b/.gitlab/jlse.yml deleted file mode 100644 index 0967922cc..000000000 --- a/.gitlab/jlse.yml +++ /dev/null @@ -1,45 +0,0 @@ -# This pipeline is run at ALCF at gitlab instance: https://gitlab.jlse.anl.gov - -stages: - - sync - - regression - -sync_buildtest_jlse_mirror: - tags: [shell, jlse] - stage: sync - only: - refs: - - schedules - script: - - cd $CI_PROJECT_DIR - - mkdir tmp && cd tmp - - git clone https://github.com/buildtesters/buildtest - - cd buildtest - - git remote add jlse_mirror git@gitlab-server-jlse-01.jlse.anl.gov:e4s/shahzebsiddiqui/buildtest.git - - git remote -v - - git branch - - git push jlse_mirror devel - - -jlse_pr_regression_test: - tags: [shell, jlse] - stage: regression - rules: - - if: '$CI_PIPELINE_SOURCE == "push" || $CI_PIPELINE_SOURCE == "web"' - when: always - script: - - whoami - - git branch - - ml conda - - python -m venv $CI_PROJECT_DIR/.pyenv - - source $CI_PROJECT_DIR/.pyenv/bin/activate - - source setup.sh - - pip install -r docs/requirements.txt - - python $BUILDTEST_ROOT/buildtest/tools/unittests.py -c - - curl -Os https://uploader.codecov.io/latest/linux/codecov - - chmod +x codecov - # CODECOV_TOKEN environment must be set, this value is stored in CI/CD variable at https://gitlab.jlse.anl.gov/e4s/shahzebsiddiqui/buildtest/-/settings/ci_cd - - ./codecov --verbose -t $CODECOV_TOKEN - - echo $? 
- - conda deactivate - - rm -rf $CI_PROJECT_DIR/.pyenv From 963ed84247dd2adafaada8652858876f01c65ece Mon Sep 17 00:00:00 2001 From: Shahzeb Siddiqui Date: Fri, 10 May 2024 10:59:22 -0400 Subject: [PATCH 5/5] fix yamllint issue --- .../schemas/examples/script.schema.json/valid/job_submission.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml b/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml index fbf2aff12..1d0d9919d 100644 --- a/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml +++ b/buildtest/schemas/examples/script.schema.json/valid/job_submission.yml @@ -18,4 +18,3 @@ buildspecs: - "-W 00:30" - "-N 1" run: hostname -