miniwdl_aws_studio.cfg
# miniwdl configuration file for use within SageMaker Studio, where the remaining necessary
# infrastructure is detected contextually:
# 1. FSAP matching the Studio domain, EFS, and user profile
# 2. Batch queue with the tag MiniwdlStudioEfsId set to the Studio domain's EFS ID
#
# For info about where to place this file, and other available options:
# https://miniwdl.readthedocs.io/en/latest/runner_reference.html#configuration
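#
# A minimal sketch of activating this file (following the standard miniwdl config search described
# at the URL above): either save it as ~/.config/miniwdl.cfg, or point the MINIWDL_CFG environment
# variable at it before launching runs, e.g.
#   export MINIWDL_CFG=$HOME/miniwdl_aws_studio.cfg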
[scheduler]
container_backend = aws_batch_job
# One `miniwdl run` process will be able to orchestrate this many concurrent AWS Batch jobs. (This
# controls the size of a thread pool, so setting it too high tends to be counterproductive.)
call_concurrency = 100
# Reduced concurrency limit for URI download jobs; since these are typically fast S3 downloads,
# running many concurrently is likely to overstress EFS.
download_concurrency = 10
[file_io]
# Run directories and all input files must reside within this shared directory
root = $HOME
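# (In SageMaker Studio, $HOME typically resides on the domain's shared EFS volume, mounted through
# the access point noted above, so run directories and input files land on shared storage.)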
[task_runtime]
# Default policy to retry spot-terminated jobs (up to three total attempts)
defaults = {
        "docker": "ubuntu:20.04",
        "preemptible": 2
    }
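# Individual WDL tasks can override these defaults in their own runtime sections; a hypothetical
# task excerpt:
#   runtime {
#     docker: "python:3.11-slim"
#     preemptible: 0
#   }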
[call_cache]
# Cache call outputs in EFS folder (valid so long as all referenced input & output files remain
# unmodified on EFS).
dir = $HOME/miniwdl/_CACHE/call
get = true
put = true
[download_cache]
dir = $HOME/miniwdl/_CACHE/download
get = true
put = false
# Disable flock on files in use from the download cache, since EFS imposes low limits on the
# number of concurrent file locks
flock = false
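# With get = true, an input URI that was downloaded previously (for example an illustrative
# s3://bucket/ref.fa) is reused from the cache directory instead of being re-downloaded; with
# put = false, new downloads are not added to the cache.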
[aws]
# Last-resort job timeout for AWS Batch to enforce (seconds; 864000 = 10 days)
job_timeout = 864000
# Internal rate limiting periods (seconds) for AWS Batch API requests
# (may need to be increased if many concurrent workflow runs are planned)
submit_period = 1
describe_period = 1
# Wait this many seconds before retrying a job after a spot instance interruption or other
# retryable failure. Provides a time window for convergence of any "eventually consistent"
# activities from the first attempt (involving e.g. EFS, CloudWatch Logs, etc.).
retry_wait = 60
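# Illustrative end-to-end invocation from a Studio terminal (the workflow path and input name are
# hypothetical); note that input files must reside under the [file_io] root ($HOME) set above:
#   miniwdl run ~/workflows/align.wdl reads=~/data/sample1.fastq.gz --verbose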