nextstrain · jameshadfield · Jul 25, 2024 · Jul 8, 2024 · Jun 21, 2024 · Jun 27, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -11,4 +11,4 @@ jobs:
     # conform since this is a collaborative repo with an external group.
     uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@v0
     with:
-      build-args: test_target
+      build-args: --configfile config/gisaid.yaml -pf test_target
diff --git a/.github/workflows/phylogenetic-fauna.yaml b/.github/workflows/phylogenetic-fauna.yaml
@@ -25,6 +25,46 @@ on:
         type: string
 
 jobs:
+  summary:
+    runs-on: ubuntu-latest
+    # Job-level env exposes the workflow inputs to the script as shell variables.
+    env:
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+      NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
+      TRIAL_NAME: ${{ inputs.trial-name }}
+    steps:
+      - name: summary_step
+        run: |
+          # Publish example dataset URLs and the build environment as the job summary.
+          dataset_one="avian-flu/h5n1/ha/2y"
+          dataset_two="avian-flu/h9n2/pb2/all-time"
+          if [[ -n "$TRIAL_NAME" ]]; then
+              heading="### Trial Build URLs"
+              # Underscores in the trial name become path separators in the URL.
+              trial_path=$( echo "$TRIAL_NAME" | sed "s|_|/|g" )
+              url_root="https://nextstrain.org/staging/avian-flu/trials/${trial_path}"
+          else
+              heading="### Canonical URLs will be updated by this run"
+              url_root="https://nextstrain.org"
+          fi
+          caveat="NOTE: These URL paths may be incorrect if you've changed the  "
+          caveat+="snakemake targets in 'config/gisaid.yaml' as part of this PR. "
+          caveat+="Please update this GitHub Action if so!"
+          image_label="${NEXTSTRAIN_DOCKER_IMAGE:-default (latest)}"
+          # One redirect for the whole summary; lines are emitted in display order.
+          {
+              echo "$heading"
+              echo ""
+              echo "  * ${url_root}/${dataset_one}"
+              echo "  * ${url_root}/${dataset_two}"
+              echo "  * etc"
+              echo ""
+              echo "> $caveat"
+              echo ""
+              echo "### Build (meta-)environment"
+              echo "  * Docker image: $image_label"
+              echo '  * Git Branch: `'"${BRANCH_NAME}"'`'
+          } >> $GITHUB_STEP_SUMMARY
   phylogenetic:
     permissions:
       id-token: write
@@ -48,6 +88,7 @@ jobs:
           --memory 28800mib \
           . \
             deploy_all \
+            --configfile config/gisaid.yaml \
             --config "${config[@]}"
 
       env: |

diff --git a/.github/workflows/phylogenetic-ncbi.yaml b/.github/workflows/phylogenetic-ncbi.yaml
@@ -32,30 +32,70 @@ on:
         type: string
 
 jobs:
+  summary:
+    runs-on: ubuntu-latest
+    steps:
+      - name: summary_step
+        run: |
+          # Write example dataset URLs and the build environment to the job summary.
+          URL_A="avian-flu/h5n1-cattle-outbreak/genome"
+          URL_B="avian-flu/h5n1-cattle-outbreak/ha"
+          if [[ "$TRIAL_NAME" ]]; then
+              TITLE="### Trial Build URLs"
+              # Underscores in the trial name become path separators in the URL.
+              TRIAL_NAME_URL=$( echo "$TRIAL_NAME" | sed "s|_|/|g" )
+              URL_BASE="https://nextstrain.org/staging/avian-flu/trials/${TRIAL_NAME_URL}"
+          else
+              TITLE="### Canonical URLs will be updated by this run"
+              URL_BASE="https://nextstrain.org"
+          fi
+          # Keep this path in sync with the --configfile passed to nextstrain build below.
+          NOTE="NOTE: These URL paths may be incorrect if you've changed the "
+          NOTE+="snakemake targets in 'config/h5n1-cattle-outbreak.yaml' as part of this PR. "
+          NOTE+="Please update this GitHub Action if so!"
+          # Single quoted redirect: the summary is appended in display order.
+          {
+              echo "$TITLE"
+              echo ""
+              echo "  * ${URL_BASE}/${URL_A}"
+              echo "  * ${URL_BASE}/${URL_B}"
+              echo "  * etc"
+              echo ""
+              echo "> $NOTE"
+              echo ""
+              echo "### Build (meta-)environment"
+              echo "  * Docker image: ${NEXTSTRAIN_DOCKER_IMAGE:-default (latest)}"
+              echo '  * Git Branch: `'"${BRANCH_NAME}"'`'
+          } >> "$GITHUB_STEP_SUMMARY"
+    # Job-level env exposes the workflow inputs to the script as shell variables.
+    env:
+      NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image }}
+      TRIAL_NAME: ${{ inputs.trial-name }}
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   phylogenetic:
     permissions:
       id-token: write
     uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
     secrets: inherit
     with:
-      runtime: docker
+      runtime: aws-batch
       run: |
         declare -a config;
 
-        config+=(
-          s3_src="s3://nextstrain-data/files/workflows/avian-flu/h5n1"
-        );
-
         if [[ "$TRIAL_NAME" ]]; then
           config+=(
             deploy_url="s3://nextstrain-staging/avian-flu_trials_${TRIAL_NAME}_"
           )
         fi;
 
         nextstrain build \
+          --detach \
+          --no-download \
+          --cpus 16 \
+          --memory 28800mib \
           . \
             deploy_all \
-            --snakefile Snakefile.genome \
+            --configfile config/h5n1-cattle-outbreak.yaml \
             --config "${config[@]}"
 
       env: |

diff --git a/README.md b/README.md
@@ -1,17 +1,26 @@
 # nextstrain.org/avian-flu
 
-This is the Nextstrain build for avian influenza subtypes A/H5N1, A/H5NX, A/H7N9, and A/H9N2.
+This is the Nextstrain build for avian influenza subtypes A/H5N1, A/H5NX, A/H7N9, and A/H9N2 as well as analysis of the 2024 A/H5N1 cattle-flu outbreak.
 The most up-to-date builds of avian influenza can be found [on nextstrain.org](https://nextstrain.org/avian-flu).
 Please see [nextstrain.org/docs](https://nextstrain.org/docs) for details about augur and pathogen builds.
 
-## Building
+The Snakemake pipeline is parameterised by two config files, one for the A/H5N1, A/H5NX, A/H7N9, and A/H9N2 builds and one for the 2024 A/H5N1 cattle-flu outbreak.
+
+
+## Segment-level GISAID builds
+
+The `config/gisaid.yaml` config builds 32 Auspice datasets (8 segments x 4 subtypes: A/H5N1, A/H5NX, A/H7N9 and A/H9N2) using GISAID data and can be run via:
+
+```bash
+snakemake --cores 1 -pf --configfile config/gisaid.yaml
+```
 
-All 32 builds (4 subtypes x 8 segments) can be built by running `snakemake`.
 This pipeline starts by downloading data from a private S3 bucket and the appropriate credentials are required; see below for how to use locally ingested files.
 For rapid AWS rebuild run as:
 
+
 ```bash
-nextstrain build --aws-batch --aws-batch-cpus 16 --aws-batch-memory 28800 . --jobs 16
+nextstrain build --aws-batch --aws-batch-cpus 16 --aws-batch-memory 28800 . --jobs 16 --configfile config/gisaid.yaml
 ```
 
 Please see [nextstrain.org/docs](https://nextstrain.org/docs) for details about augur and pathogen builds.
@@ -21,14 +30,44 @@ Please see [nextstrain.org/docs](https://nextstrain.org/docs) for details about
 The pipeline can automatically deploy resulting builds within the auspice folder
 to nextstrain.org by running:
 
+```bash
+nextstrain build . --configfile config/gisaid.yaml -f deploy_all
 ```
-nextstrain build . deploy_all
+
+## H5N1 Cattle Outbreak (2024)
+
+We produce per-segment and whole-genome (concatenated segments) builds for the ongoing H5N1 cattle-flu outbreak.
+These use NCBI data including consensus genomes and SRA data assembled via the Andersen lab's [avian-influenza repo](https://github.com/andersen-lab/avian-influenza).
+
+> Running this build will overwrite GISAID files in `./data` and thus you can't maintain or run GISAID & NCBI builds in parallel. In most cases this isn't an issue and [we are working on improving this](https://github.com/nextstrain/avian-flu/issues/70). You may want to proactively remove the `./data` directory yourself to make sure everything works as expected.
+
+
+```bash
+snakemake --cores 1 -pf --configfile config/h5n1-cattle-outbreak.yaml
 ```
 
+This pipeline starts by downloading data from a public S3 bucket; however, credentials may still be required to interact with AWS S3 buckets.
+
+
+**Genome builds**
+
+The build is restricted to a set of strains where we think there's no reassortment, with outgroups excluded (`config/dropped_strains_h5n1-cattle-outbreak.txt`).
+Output files will be placed in `results/h5n1-cattle-outbreak/genome`.
+See `Snakefile.genome` for more details.
+
+
+**Segment-level builds**
+
+Strains for each segment are chosen by first constructing a general tree for the segment with all strains from 2024 onwards and then taking the clade which contains all strains in the genome build.
+This should allow any reassortments to be highlighted and will also include outbreak strains which are missing from the genome build (because they don't have all 8 segments sequenced).
+
+> Note that generating any segment-level build here will necessarily build the genome tree, as it's needed to identify the clade of interest in each segment.
+
+
 ## Creating a custom build
 The easiest way to generate your own, custom avian-flu build is to use the quickstart-build as a starting template. Simply clone the quickstart-build, run with the example data, and edit the Snakefile to customize. This build includes example data and a simplified, heavily annotated Snakefile that goes over the structure of Snakefiles and annotates rules and inputs/outputs that can be modified. This build, with it's own readme, is available [here](https://github.com/nextstrain/avian-flu/tree/master/quickstart-build).
 
-### Features unique to avian flu builds
+## Features unique to avian flu builds
 
 #### cleavage site annotations
 Influenza virus HA is translated as a single peptide (HA0) that is cleaved to form the mature, functional form (HA1 and HA2). In all human strains and many avian strains, the cleavage site is composed of a single, basic amino acid residue. However, some avian influenza subtypes, particularly H5s, have acquired additional basic residues immediately preceding the HA cleavage site. In some cases, this results in addition of a furin cleavage motif, allowing HA to be cleaved by furin, which is ubiquitously expressed, and allows for viral replication across a range of tissues. The addition of this "polybasic cleavage site" is one of the prime determinants of avian influenza virulence. In these builds, we have annotated whether strains contain a furin cleavage motif, defined here as the sequence `R-X-K/R-R` immediately preceding the start of HA2, where `X` can be any amino acid. We have also added a color by for the cleavage site sequence, which we define here as the 4 bases preceding HA2.
@@ -73,26 +112,6 @@ Run the pipeline with `--config 'local_ingest=True'` to use the locally availabl
 Specifically, the files needed are `ingest/results/metadata.tsv` and `ingest/results/sequences_{SEGMENT}.fasta`.
 
 
-#### Running full genome builds
-
-Run full genome builds with the following command.
-
-``` bash
-nextstrain build \
-    --env AWS_ACCESS_KEY_ID \
-    --env AWS_SECRET_ACCESS_KEY \
-    . \
-        --snakefile Snakefile.genome \
-        --config s3_src=s3://nextstrain-data/files/workflows/avian-flu/h5n1
-```
-
-Currently this is only set up for the "h5n1-cattle-outbreak" build using NCBI data,
-and the build is restricted to a set of strains where we think there's no reassortment, with outgroups
-excluded in (`config/dropped_strains_h5n1-cattle-outbreak.txt`).
-Output files will be placed in `results/h5n1-cattle-outbreak/genome`.
-See `Snakefile.genome` for more details.
-
-
 ### To modify this build to work with your own data
 Although the simplest way to generate a custom build is via the quickstart build, you are welcome to clone this repo and use it as a starting point for running your own, local builds if you'd like. The [Nextstrain docs](https://docs.nextstrain.org/en/latest/index.html) are a fantastic resource for getting started with the Nextstrain pipeline, and include some [great tutorials](https://docs.nextstrain.org/en/latest/install.html) to get you started. This build is slightly more complicated than other builds, and has a few custom functions in it to accommodate all the features on [nextstrain.org](https://nextstrain.org/avian-flu), and makes use of wildcards for both subtypes and gene segments. If you'd like to adapt this full, non-simplified pipeline here to your own data (which you may want to do if you also want to annotate clades), you would need to make a few changes and additions: