diff --git a/.github/workflows/rebuild-100k.yml b/.github/workflows/rebuild-100k.yml
index d245b662e..529a3526d 100644
--- a/.github/workflows/rebuild-100k.yml
+++ b/.github/workflows/rebuild-100k.yml
@@ -17,7 +17,9 @@ jobs:
       with:
         python-version: "3.10"
 
-    - name: Launch build
+    - name: Launch GISAID build
       run: |
-        set -x
+        # Fail this step if the launch fails even though its output is piped
+        # to tee (GitHub's default `bash -e {0}` shell does not set pipefail).
+        set -x -o pipefail
 
@@ -31,27 +33,63 @@ jobs:
           --memory 31GiB \
           . \
             upload \
-            --configfile nextstrain_profiles/100k/config.yaml \
+            --configfile nextstrain_profiles/100k/config-gisaid.yaml \
             --config "${config[@]}" \
             --set-threads tree=8 \
-        |& tee build-launch.log
+        |& tee build-launch-gisaid.log
       env:
         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
 
+    - name: Launch open build
+      run: |
+        # Fail this step if the launch fails even though its output is piped
+        # to tee (GitHub's default `bash -e {0}` shell does not set pipefail).
+        set -x -o pipefail
+
+        declare -a config
+        config+=(slack_token=$SLACK_TOKEN)
+
+        nextstrain build \
+          --aws-batch \
+          --detach \
+          --cpus 16 \
+          --memory 31GiB \
+          . \
+            upload \
+            --configfile nextstrain_profiles/100k/config-open.yaml \
+            --config "${config[@]}" \
+            --set-threads tree=8 \
+        |& tee build-launch-open.log
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+
     - name: Build info
       run: |
-        echo "--> 100k sample rebuilding on AWS"
+        echo "--> 100k samples for GISAID + Open data rebuilding (using separate AWS jobs)"
         echo
-        echo "--> When completed, the following 2 files will be updated:"
+        echo "--> When completed, the following files will be updated:"
+        echo "s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz"
+        echo "s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz"
         echo "s3://nextstrain-ncov-private/100k/metadata.tsv.xz"
         echo "s3://nextstrain-ncov-private/100k/sequences.fasta.xz"
         echo
-        echo "--> You can attach to this AWS job via:"
-        tail -n1 build-launch.log
+        echo "--> You can attach to the GISAID AWS job via:"
+        tail -n1 build-launch-gisaid.log
+        echo
+        echo "--> You can attach to the Open AWS job via:"
+        tail -n1 build-launch-open.log
+        echo
+        # The last line of each launch log is expected to contain `attach <job-id>`;
+        # sed extracts that id to build the AWS console link.
+        JOBID=$( tail -n1 build-launch-gisaid.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
+        echo "--> View the GISAID job in the AWS console via"
+        echo " https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}"
         echo
-        JOBID=$( tail -n1 build-launch.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
-        echo "--> View this job in the AWS console via"
+        JOBID=$( tail -n1 build-launch-open.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
+        echo "--> View the Open job in the AWS console via"
         echo " https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}"
         echo
diff --git a/nextstrain_profiles/100k/README.md b/nextstrain_profiles/100k/README.md
index 240406af1..b47e9ef93 100644
--- a/nextstrain_profiles/100k/README.md
+++ b/nextstrain_profiles/100k/README.md
@@ -1,21 +1,27 @@
 ## Aim
 To build a representative 100k dataset which is available for testing / developing builds locally.
-This is intended to run weekly via a GitHub action (which triggers a job to be run on AWS).
-It will make two files available:
+This is intended to run weekly via a GitHub action (which triggers jobs to be run on AWS).
+It will upload these files:
 
+* `s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz`
+* `s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz`
 * `s3://nextstrain-ncov-private/100k/metadata.tsv.xz`
 * `s3://nextstrain-ncov-private/100k/sequences.fasta.xz`
 
 While this profile is not recommended to be run locally, you can see what rules would be run via:
 
 ```
-snakemake --cores 1 --configfile nextstrain_profiles/100k/config.yaml -npf upload --dag | dot -Tpdf > dag.pdf
+snakemake --cores 1 --configfile nextstrain_profiles/100k/config-gisaid.yaml -npf upload --dag | dot -Tpdf > dag-100k-gisaid.pdf
+snakemake --cores 1 --configfile nextstrain_profiles/100k/config-open.yaml -npf upload --dag | dot -Tpdf > dag-100k-open.pdf
 ```
 
-To run manually you can trigger the GitHub action or run the job locally via:
+To run manually you can trigger the GitHub action (recommended) or run the jobs locally via:
 
 ```
 nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \
-    --configfile nextstrain_profiles/100k/config.yaml \
+    --configfile nextstrain_profiles/100k/config-gisaid.yaml \
+    -f upload
+nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \
+    --configfile nextstrain_profiles/100k/config-open.yaml \
     -f upload
 ```
diff --git a/nextstrain_profiles/100k/config.yaml b/nextstrain_profiles/100k/config-gisaid.yaml
similarity index 100%
rename from nextstrain_profiles/100k/config.yaml
rename to nextstrain_profiles/100k/config-gisaid.yaml
diff --git a/nextstrain_profiles/100k/config-open.yaml b/nextstrain_profiles/100k/config-open.yaml
new file mode 100644
index 000000000..0702c92ee
--- /dev/null
+++ b/nextstrain_profiles/100k/config-open.yaml
@@ -0,0 +1,34 @@
+# This file is largely duplicated from `config-gisaid.yaml` - please
+# see that file for comments
+# TODO(review): this destination carried an unexplained "TODO XXX" marker - confirm it before merging
+S3_DST_BUCKET: "nextstrain-data/files/ncov/open/100k"
+S3_DST_ORIGINS: [needed-for-workflow-but-unused]
+deploy_url: needed_for_workflow_but_unused
+custom_rules:
+  - workflow/snakemake_rules/export_for_nextstrain.smk
+# Source data: the open metadata and aligned sequences on S3
+inputs:
+  - name: open
+    metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.zst"
+    aligned: "s3://nextstrain-data/files/ncov/open/sequences.fasta.zst"
+    skip_sanitize_metadata: true
+builds:
+  100k:
+    subsampling_scheme: 100k_scheme
+# Uploaded file name -> local result path (presumably written under S3_DST_BUCKET)
+upload:
+  metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz
+  sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz
+filter:
+  exclude_where: "division='USA'"
+# Two groups of up to 50,000 sequences each, split at the `1Y` date cutoff
+subsampling:
+  100k_scheme:
+    50k_early:
+      group_by: "year month country"
+      max_sequences: 50000
+      max_date: "--max-date 1Y"
+    50k_late:
+      group_by: "year month country"
+      max_sequences: 50000
+      min_date: "--min-date 1Y"