Skip to content

Commit

Permalink
Adding a python utility for summarizing compute. (#769)
Browse files Browse the repository at this point in the history
Adds a utility that summarizes Rocoto database computational usage information.
  • Loading branch information
christinaholtNOAA authored May 16, 2022
1 parent 11dc291 commit 961300d
Showing 1 changed file with 187 additions and 0 deletions.
187 changes: 187 additions & 0 deletions tests/WE2E/create_WE2E_resource_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
'''
Generate a summary of resources used for the WE2E test suite.
Examples:
To print usage
python create_WE2E_resource_summary.py
python create_WE2E_resource_summary.py -h
To print a report for all the experiments in an experiment directory
python create_WE2E_resource_summary.py -e /path/to/expt_dir
To print a report for all the grid_* and nco_* experiments.
python create_WE2E_resource_summary.py -e /path/to/expt_dir \
-n 'grid*' 'nco*'
To compute a total estimated cost for all experiments on instances that are
$0.15 per core hour.
python create_WE2E_resource_summary.py -e /path/to/expt_dir -c $0.15
Information about the output summary.
- The core hours are an underestimate in many cases.
- Multiple tries are not captured.
- The use of a portion of a node or instance is not known. If the whole node
is used, but isn't reflected in the core count, the cores are not counted.
Partition information is not stored in the database, so mapping to a given
node type becomes ambiguous.
For example, jobs that request 4 nodes with 2 processors per node with an
--exclusive flag will underestimate the total core hour usage by a factor
of 20 when using a 40 processor node.
- When computing cost per job, it will also provide an underestimate for the
reasons listed above.
- Only one cost will be applied across all jobs. Rocoto jobs do not store
partition information in the job table, so was not included as an option here.
'''

import argparse
import glob
import os
import sys
import sqlite3

REPORT_WIDTH = 110

def parse_args(argv):


'''
Function maintains the arguments accepted by this script. Please see
Python's argparse documenation for more information about settings of each
argument.
'''

parser = argparse.ArgumentParser(
description="Generate a usage report for a set of SRW experiments."
)

parser.add_argument(
'-e', '--expt_path',
help='The path to the directory containing the experiment \
directories',
)
parser.add_argument(
'-n', '--expt_names',
default=['*'],
help='A list of experiments to generate the report for. Wildcards \
accepted by glob.glob may be used. If not provided, a report will be \
generated for all experiments in the expt_path that have a Rocoto \
database',
nargs='*',
)

# Optional
parser.add_argument(
'-c', '--cost_per_core_hour',
help='Provide the cost per core hour for the instance type used. \
Only supports homogenous clusters.',
type=float,
)

return parser.parse_args(argv)

def get_workflow_info(db_path):

''' Given the path to a Rocoto database, return the total number of tasks,
core hours and wall time for the workflow. '''

con = sqlite3.connect(db_path)
cur = con.cursor()

# jobs schema is:
# (id INTEGER PRIMARY KEY, jobid VARCHAR(64), taskname VARCHAR(64), cycle
# DATETIME, cores INTEGER, state VARCHAR(64), native_state VARCHAR[64],
# exit_status INTEGER, tries INTEGER, nunknowns INTEGER, duration REAL)
#
# an example:
# 5|66993580|make_sfc_climo|1597017600|48|SUCCEEDED|COMPLETED|0|1|0|83.0
try:
cur.execute('SELECT cores, duration from jobs')
except sqlite3.OperationalError:
return 0, 0, 0

workflow_info = cur.fetchall()

core_hours = 0
wall_time = 0
ntasks = 0
for cores, duration in workflow_info:
core_hours += cores * duration / 3600
wall_time += duration / 60
ntasks += 1

return ntasks, core_hours, wall_time


def fetch_expt_summaries(expts):

''' Get the important information from the database of each experiment, and
return a list, sorted by experiment name. '''

summaries = []
for expt in expts:
test_name = expt.split('/')[-1]
db_path = os.path.join(expt, 'FV3LAM_wflow.db')
if not os.path.exists(db_path):
print(f'No FV3LAM_wflow.db exists for expt: {test_name}')
continue
ntasks, core_hours, wall_time = get_workflow_info(db_path)
summaries.append((test_name, ntasks, core_hours, wall_time))

return sorted(summaries)

def generate_report(argv):

''' Given user arguments, print a summary of the requested experiments'
usage information, including cost (if requested). '''

cla = parse_args(argv)

experiments = []
for expt in cla.expt_names:
experiments.extend(glob.glob(
os.path.join(cla.expt_path, expt)
))

header = f'{" "*60} Core Hours | Run Time (mins)'
if cla.cost_per_core_hour:
header = f'{header} | Est. Cost ($) '

print('-'*REPORT_WIDTH)
print('-'*REPORT_WIDTH)
print(header)
print('-'*REPORT_WIDTH)

total_ch = 0
total_cost = 0
for name, ntasks, ch, wt in fetch_expt_summaries(experiments):
line = f'{name[:60]:<60s} {ch:^12.2f} {wt:^20.1f}'
if cla.cost_per_core_hour:
cost = ch * cla.cost_per_core_hour
line = f'{line} ${cost:<.2f}'
total_cost += cost
total_ch += ch
print(line)

print('-'*REPORT_WIDTH)
print(f'TOTAL CORE HOURS: {total_ch:6.2f}')
if cla.cost_per_core_hour:
print(f'TOTAL COST: ${cla.cost_per_core_hour * total_ch:6.2f}')

print('*'*REPORT_WIDTH)
print('WARNING: This data reflects only the job information from the last',
'logged try. It does not account for the use \n of an entire node, only',
'the actual cores requested. It may provide an underestimate of true compute usage.')
print('*'*REPORT_WIDTH)


if __name__ == "__main__":
generate_report(sys.argv[1:])

0 comments on commit 961300d

Please sign in to comment.