From 7f9655389834b6ad544cda052f269fb038cc2cc0 Mon Sep 17 00:00:00 2001 From: j23414 Date: Thu, 29 Dec 2022 17:46:37 -0800 Subject: [PATCH] edit: Use a permalink for each script Use a permalink for each script to allow us to version the software we use in this workflow without being affected by upstream changes until we want to bump the version. This design adds more maintenance to this workflow, but it also protects users against unexpected issues that are outside of their control. Discussed in https://github.com/nextstrain/ebola/pull/6#discussion_r1048835183 Prefer curl for downloads, falling back to wget when curl is not installed, as discussed in: https://github.com/nextstrain/ebola/pull/6#discussion_r1048835183 --- .../snakemake_rules/fetch_sequences.smk | 25 ++++--- .../snakemake_rules/slack_notifications.smk | 49 +++++++++----- ingest/workflow/snakemake_rules/transform.smk | 67 ++++++++++++------- .../snakemake_rules/trigger_rebuild.smk | 25 ++++--- ingest/workflow/snakemake_rules/upload.smk | 31 ++++++--- 5 files changed, 126 insertions(+), 71 deletions(-) diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index d1c2173f..f6348509 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -28,18 +28,25 @@ rule fetch_from_genbank: genbank_ndjson="data/genbank_{serotype}.ndjson", params: serotype_tax_id=download_serotype, - csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/csv-to-ndjson", + csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/csv-to-ndjson", shell: """ - if [[ ! -d bin ]]; then - mkdir bin - fi - if [[ ! -f bin/csv-to-ndjson ]]; then - cd bin - wget {params.csv_to_ndjson_url} - chmod 755 * - cd .. 
+ # (1) Pick curl or wget based on availability + if which curl > /dev/null; then + download_cmd="curl -fsSL --output" + elif which wget > /dev/null; then + download_cmd="wget -O" + else + echo "ERROR: Neither curl nor wget found. Please install one of them." + exit 1 fi + + # (2) Download the required scripts if not already present + [[ -d bin ]] || mkdir bin + [[ -f bin/csv-to-ndjson ]] || $download_cmd bin/csv-to-ndjson {params.csv_to_ndjson_url} + chmod +x bin/* + + # (3) Fetch sequences from GenBank ./bin/fetch-from-genbank {params.serotype_tax_id} > {output.genbank_ndjson} """ diff --git a/ingest/workflow/snakemake_rules/slack_notifications.smk b/ingest/workflow/snakemake_rules/slack_notifications.smk index 638f84af..21dd3ccf 100644 --- a/ingest/workflow/snakemake_rules/slack_notifications.smk +++ b/ingest/workflow/snakemake_rules/slack_notifications.smk @@ -28,19 +28,25 @@ rule notify_on_genbank_record_change: touch("data/notify/genbank-record-change.done"), params: s3_src=S3_SRC, - notify_on_record_change_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/notify-on-record-change", + notify_on_record_change_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/notify-on-record-change", shell: """ - if [[ ! -d bin ]]; then - mkdir bin - fi - if [[ ! -f bin/notify-on-record-change ]]; then - cd bin - wget {params.notify_on_record_change_url} - chmod 755 - cd .. + # (1) Pick curl or wget based on availability + if which curl > /dev/null; then + download_cmd="curl -fsSL --output" + elif which wget > /dev/null; then + download_cmd="wget -O" + else + echo "ERROR: Neither curl nor wget found. Please install one of them." 
+ exit 1 fi + # (2) Download the required scripts if not already present + [[ -d bin ]] || mkdir bin + [[ -f bin/notify-on-record-change ]] || $download_cmd bin/notify-on-record-change {params.notify_on_record_change_url} + chmod +x bin/* + + # (3) Run the script ./bin/notify-on-record-change {input.genbank_ndjson} {params.s3_src:q}/genbank.ndjson.xz Genbank """ @@ -52,18 +58,25 @@ rule notify_on_metadata_diff: touch("data/notify/metadata-diff.done"), params: s3_src=S3_SRC, - notify_on_diff_url = "https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/notify-on-diff", + notify_on_diff_url = "https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/notify-on-diff", shell: """ - if [[ ! -d bin ]]; then - mkdir bin - fi - if [[ ! -f bin/notify-on-diff ]]; then - cd bin - wget {params.notify_on_diff_url} - chmod 755 - cd .. + # (1) Pick curl or wget based on availability + if which curl > /dev/null; then + download_cmd="curl -fsSL --output" + elif which wget > /dev/null; then + download_cmd="wget -O" + else + echo "ERROR: Neither curl nor wget found. Please install one of them." 
+ exit 1 fi + + # (2) Download the required scripts if not already present + [[ -d bin ]] || mkdir bin + [[ -f bin/notify-on-diff ]] || $download_cmd bin/notify-on-diff {params.notify_on_diff_url} + chmod +x bin/* + + # (3) Run the script ./bin/notify-on-diff {input.metadata} {params.s3_src:q}/metadata.tsv.gz """ diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk index 7ada602e..0a1ded5b 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/transform.smk @@ -20,7 +20,18 @@ rule fetch_general_geolocation_rules: geolocation_rules_url=config["transform"]["geolocation_rules_url"], shell: """ - curl {params.geolocation_rules_url} > {output.general_geolocation_rules} + # (1) Pick curl or wget based on availability + if which curl > /dev/null; then + download_cmd="curl -fsSL --output" + elif which wget > /dev/null; then + download_cmd="wget -O" + else + echo "ERROR: Neither curl nor wget found. Please install one of them." 
+ exit 1 + fi + + # (2) Fetch general geolocation rules + $download_cmd {output.general_geolocation_rules} {params.geolocation_rules_url} """ @@ -62,33 +73,41 @@ rule transform: metadata_columns=config["transform"]["metadata_columns"], id_field=config["transform"]["id_field"], sequence_field=config["transform"]["sequence_field"], - transform_field_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-field-names", - transform_string_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-string-fields", - transform_strain_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-strain-names", - transform_date_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-date-fields", - transform_genbank_location_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-genbank-location", - transform_authors_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/transform-authors", - apply_geolocation_rules_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/apply-geolocation-rules", - merge_user_metadata_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/merge-user-metadata", - ndjson_to_tsv_and_fasta_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/ndjson-to-tsv-and-fasta", + transform_field_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-field-names", + transform_string_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-string-fields", + transform_strain_names_url="https://raw.githubusercontent.com/nextstrain/monkeypox/b54768ec17872eb0d898e29527785642f6b98c0d/ingest/bin/transform-strain-names", + 
transform_date_fields_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-date-fields", + transform_genbank_location_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-genbank-location", + transform_authors_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/transform-authors", + apply_geolocation_rules_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/apply-geolocation-rules", + merge_user_metadata_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/merge-user-metadata", + ndjson_to_tsv_and_fasta_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/ndjson-to-tsv-and-fasta", shell: """ - if [[ ! -d bin ]]; then - mkdir bin + # (1) Pick curl or wget based on availability + if which curl > /dev/null; then + download_cmd="curl -fsSL --output" + elif which wget > /dev/null; then + download_cmd="wget -O" + else + echo "ERROR: Neither curl nor wget found. Please install one of them." 
+ exit 1 fi - cd bin - [[ -f transform-field-names ]] || wget {params.transform_field_names_url} - [[ -f transform-string-fields ]] || wget {params.transform_string_fields_url} - [[ -f transform-strain-names ]] || wget {params.transform_strain_names_url} - [[ -f transform-date-fields ]] || wget {params.transform_date_fields_url} - [[ -f transform-genbank-location ]] || wget {params.transform_genbank_location_url} - [[ -f transform-authors ]] || wget {params.transform_authors_url} - [[ -f apply-geolocation-rules ]] || wget {params.apply_geolocation_rules_url} - [[ -f merge-user-metadata ]] || wget {params.merge_user_metadata_url} - [[ -f ndjson-to-tsv-and-fasta ]] || wget {params.ndjson_to_tsv_and_fasta_url} - chmod 755 * - cd .. + # (2) Download the required scripts if not already present + [[ -d bin ]] || mkdir bin + [[ -f bin/transform-field-names ]] || $download_cmd bin/transform-field-names {params.transform_field_names_url} + [[ -f bin/transform-string-fields ]] || $download_cmd bin/transform-string-fields {params.transform_string_fields_url} + [[ -f bin/transform-strain-names ]] || $download_cmd bin/transform-strain-names {params.transform_strain_names_url} + [[ -f bin/transform-date-fields ]] || $download_cmd bin/transform-date-fields {params.transform_date_fields_url} + [[ -f bin/transform-genbank-location ]] || $download_cmd bin/transform-genbank-location {params.transform_genbank_location_url} + [[ -f bin/transform-authors ]] || $download_cmd bin/transform-authors {params.transform_authors_url} + [[ -f bin/apply-geolocation-rules ]] || $download_cmd bin/apply-geolocation-rules {params.apply_geolocation_rules_url} + [[ -f bin/merge-user-metadata ]] || $download_cmd bin/merge-user-metadata {params.merge_user_metadata_url} + [[ -f bin/ndjson-to-tsv-and-fasta ]] || $download_cmd bin/ndjson-to-tsv-and-fasta {params.ndjson_to_tsv_and_fasta_url} + chmod +x bin/* + + # (3) Transform the sequences (cat {input.sequences_ndjson} \ | ./bin/transform-field-names \ 
--field-map {params.field_map} \ diff --git a/ingest/workflow/snakemake_rules/trigger_rebuild.smk b/ingest/workflow/snakemake_rules/trigger_rebuild.smk index 6048dd81..122931e5 100644 --- a/ingest/workflow/snakemake_rules/trigger_rebuild.smk +++ b/ingest/workflow/snakemake_rules/trigger_rebuild.smk @@ -14,17 +14,24 @@ rule trigger_build: output: touch("data/trigger/rebuild.done") params: - trigger_on_new_data_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/trigger-on-new-data" + trigger_on_new_data_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/trigger-on-new-data" shell: """ - if [[ ! -d bin ]]; then - mkdir bin - fi - if [[ ! -f bin/trigger-on-new-data ]]; then - cd bin - wget {params.trigger_on_new_data_url} - chmod 755 * - cd .. + # (1) Pick curl or wget based on availability + if which curl > /dev/null; then + download_cmd="curl -fsSL --output" + elif which wget > /dev/null; then + download_cmd="wget -O" + else + echo "ERROR: Neither curl nor wget found. Please install one of them." 
+ exit 1 fi + + # (2) Download the required scripts if not already present + [[ -d bin ]] || mkdir bin + [[ -f bin/trigger-on-new-data ]] || $download_cmd bin/trigger-on-new-data {params.trigger_on_new_data_url} + chmod +x bin/* + + # (3) Trigger the build ./bin/trigger-on-new-data {input.metadata_upload} {input.fasta_upload} """ diff --git a/ingest/workflow/snakemake_rules/upload.smk b/ingest/workflow/snakemake_rules/upload.smk index 95ad81e3..67b99d43 100644 --- a/ingest/workflow/snakemake_rules/upload.smk +++ b/ingest/workflow/snakemake_rules/upload.smk @@ -54,20 +54,29 @@ rule upload_to_s3: quiet="" if send_notifications else "--quiet", s3_dst=config["upload"].get("s3", {}).get("dst", ""), cloudfront_domain=config["upload"].get("s3", {}).get("cloudfront_domain", ""), - upload_to_s3_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/upload-to-s3", - sha256sum_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/sha256sum", - cloudfront_invalidate_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/cloudfront-invalidate" + upload_to_s3_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/upload-to-s3", + sha256sum_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/sha256sum", + cloudfront_invalidate_url="https://raw.githubusercontent.com/nextstrain/monkeypox/644d07ebe3fa5ded64d27d0964064fb722797c5d/ingest/bin/cloudfront-invalidate" shell: """ - if [[ ! -d bin ]]; then - mkdir bin + # (1) Pick curl or wget based on availability + if which curl > /dev/null; then + download_cmd="curl -fsSL --output" + elif which wget > /dev/null; then + download_cmd="wget -O" + else + echo "ERROR: Neither curl nor wget found. Please install one of them." 
+ exit 1 fi - cd bin - [[ -f upload-to-s3 ]] || wget {params.upload_to_s3_url} - [[ -f sha256sum ]] || wget {params.sha256sum_url} - [[ -f cloudfront-invalidate ]] || wget {params.cloudfront_invalidate_url} - chmod 755 * - cd .. + + # (2) Download the required scripts if not already present + [[ -d bin ]] || mkdir bin + [[ -f bin/upload-to-s3 ]] || $download_cmd bin/upload-to-s3 {params.upload_to_s3_url} + [[ -f bin/sha256sum ]] || $download_cmd bin/sha256sum {params.sha256sum_url} + [[ -f bin/cloudfront-invalidate ]] || $download_cmd bin/cloudfront-invalidate {params.cloudfront_invalidate_url} + chmod +x bin/* + + # (3) Run the upload script ./bin/upload-to-s3 \ {params.quiet} \ {input.file_to_upload:q} \