Skip to content

Commit

Permalink
Refactor upload configs to match pathogen-repo-guide
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Mar 15, 2024
1 parent 85fae8f commit e9212f1
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 57 deletions.
37 changes: 19 additions & 18 deletions ingest/build-configs/nextstrain-automation/config.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
# Optional configs used by Nextstrain team
# Params for uploads
upload:
# Upload params for AWS S3
s3:
# AWS S3 Bucket with prefix
dst: 's3://nextstrain-data/files/workflows/zika'
# Mapping of files to upload, with key as remote file name and the value
# the local file path relative to the ingest directory.
files_to_upload:
genbank.ndjson.xz: data/genbank.ndjson
all_sequences.ndjson.xz: data/sequences.ndjson
metadata.tsv.gz: results/metadata.tsv
sequences.fasta.xz: results/sequences.fasta
alignment.fasta.xz: data/alignment.fasta
insertions.csv.gz: data/insertions.csv
translations.zip: data/translations.zip
# This configuration file should contain all required configuration parameters
# for the ingest workflow to run with additional Nextstrain automation rules.

cloudfront_domain: 'data.nextstrain.org'
# Custom rules to run as part of the Nextstrain automated workflow
# The paths should be relative to the ingest directory.
custom_rules:
- build-configs/nextstrain-automation/upload.smk

# Nextstrain CloudFront domain to ensure that we invalidate CloudFront after the S3 uploads
# This is required as long as we are using the AWS CLI for uploads
cloudfront_domain: "data.nextstrain.org"

# Nextstrain AWS S3 Bucket with pathogen prefix
# Replace <pathogen> with the pathogen repo name.
s3_dst: "s3://nextstrain-data/files/workflows/zika"

files_to_upload:
genbank.ndjson.zst: data/genbank.ndjson
metadata.tsv.zst: results/metadata.tsv
sequences.fasta.zst: results/sequences.fasta

# Toggle for Slack notifications
send_slack_notifications: True
Expand Down
61 changes: 22 additions & 39 deletions ingest/build-configs/nextstrain-automation/upload.smk
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
"""
This part of the workflow handles uploading files to a specified destination.
This part of the workflow handles uploading files to AWS S3.
Uses the predefined wildcard `file_to_upload` to determine the input and the predefined
wildcard `remote_file_name` as the remote file name in the specified destination.
Files to upload must be defined in the `files_to_upload` config param, where
the keys are the remote files and the values are the local filepaths
relative to the ingest directory.
Produces output files as `data/upload/{upload_target_name}/{remote_file_name}.done`.
Produces a single file for each uploaded file:
"results/upload/{remote_file}.upload"
Currently only supports uploads to AWS S3, but additional upload rules can
be easily added as long as they follow the output pattern described above.
The rule `upload_all` can be used as a target to upload all files.
"""
import os

Expand All @@ -17,48 +18,30 @@ send_notifications = (
)


def _get_upload_inputs(wildcards):
    """
    Return the named inputs needed to upload one remote file.

    The local file is looked up in the
    `config["upload"]["s3"]["files_to_upload"]` mapping under the key
    `wildcards.remote_file_name`.

    If the file_to_upload has Slack notifications that depend on diffs with S3 files,
    then we want the upload rule to run after the notification rule.

    This function is mostly to keep track of which flag files to expect for
    the rules in `slack_notifications.smk`, so it only includes flag files if
    `send_notifications` is True.

    Returns a dict that always has the key "file_to_upload" and, when
    `send_notifications` is truthy, also "notify_flag_file" (a flag file
    path, or an empty list for files without a notification flag file).

    Raises KeyError if the remote file name is not present in the config mapping.
    """
    # Bind the local path once: it is both returned and compared below.
    file_to_upload = config["upload"]["s3"]["files_to_upload"][
        wildcards.remote_file_name
    ]
    inputs = {"file_to_upload": file_to_upload}

    if send_notifications:
        # Local files whose notification rule leaves a flag file behind.
        notify_flag_files = {
            "data/genbank.ndjson": "data/notify/genbank-record-change.done",
            "results/metadata.tsv": "data/notify/metadata-diff.done",
        }
        # Files without a notification keep the original empty-list placeholder.
        inputs["notify_flag_file"] = notify_flag_files.get(file_to_upload, [])

    return inputs


# Upload one local file to AWS S3. The combined stdout/stderr of the upload
# script is written to "results/upload/{remote_file}.upload", which is the
# file other rules request to trigger the upload.
rule upload_to_s3:
    input:
        # Local path configured for this remote file name in `files_to_upload`.
        file_to_upload=lambda wildcards: config["files_to_upload"][wildcards.remote_file],
    output:
        "results/upload/{remote_file}.upload",
    params:
        # `--quiet` is passed to the upload script only when notifications are off.
        quiet="" if send_notifications else "--quiet",
        s3_dst=config["s3_dst"],
        cloudfront_domain=config["cloudfront_domain"],
    shell:
        """
        ./vendored/upload-to-s3 \
            {params.quiet} \
            {input.file_to_upload:q} \
            {params.s3_dst:q}/{wildcards.remote_file:q} \
            {params.cloudfront_domain} 2>&1 | tee {output}
        """


# Aggregate target: requests the ".upload" file of every remote file listed in
# the `files_to_upload` config param, then touches a single completion flag.
rule upload_all:
    input:
        uploads=[
            "results/upload/" + remote_name + ".upload"
            for remote_name in config["files_to_upload"]
        ],
    output:
        touch("results/upload_all.done"),

0 comments on commit e9212f1

Please sign in to comment.