Skip to content

Commit

Permalink
refactor: serotypes
Browse files Browse the repository at this point in the history
Since serotype is already annotated as a column in the metadata, simplify intermediate filenames such as `data/sequences_{serotype}.fasta` and `data/metadata_{serotype}.tsv` to `data/sequences.fasta` and `data/metadata.tsv`.
  • Loading branch information
j23414 committed Dec 16, 2022
1 parent 370f6b8 commit fc12467
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 28 deletions.
6 changes: 3 additions & 3 deletions ingest/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ This is the ingest pipeline for Ebola virus sequences.
Fetch sequences with

```sh
nextstrain build --cpus 1 . data/sequences_all.ndjson
nextstrain build --cpus 1 . data/sequences.ndjson
```

Run the complete ingest pipeline with
Expand All @@ -21,8 +21,8 @@ nextstrain build --cpus 1 .

This will produce two files (within the `ingest` directory):

- data/metadata_all.tsv
- data/sequences_all.fasta
- data/metadata.tsv
- data/sequences.fasta

Run the complete ingest pipeline and upload results to AWS S3 with

Expand Down
2 changes: 1 addition & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ serotypes = ['all']

def _get_all_targets(wildcards):
# Default targets are the zstd-compressed metadata TSV and sequences FASTA files
all_targets = expand(["data/sequences_{serotype}.fasta.zst", "data/metadata_{serotype}.tsv.zst"], serotype=serotypes)
all_targets = ["data/sequences.fasta.zst", "data/metadata.tsv.zst"]

# Add additional targets based on upload config
upload_config = config.get("upload", {})
Expand Down
8 changes: 4 additions & 4 deletions ingest/config/optional.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ upload:
dst: 's3://nextstrain-data/files/workflows/ebola/test'
# Files to upload to S3 that are in the `data` directory
files_to_upload: [
'metadata_all.tsv.zst',
'sequences_all.fasta.zst',
'metadata.tsv.zst',
'sequences.fasta.zst',
]
# Remote file names for the files to upload; these must be in the same order as the local files above
remote_file_names: [
'metadata_all.tsv.zst',
'sequences_all.fasta.zst',
'metadata.tsv.zst',
'sequences.fasta.zst',
]
cloudfront_domain: 'data.nextstrain.org'

Expand Down
14 changes: 4 additions & 10 deletions ingest/workflow/snakemake_rules/fetch_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,11 @@ Produces final output as
"""

def download_serotype(w):
serotype = {
'all': '186536', # Returns 3530 records, check if we need a more specific Taxon ID
}
return serotype[w.serotype]

rule fetch_from_genbank:
output:
genbank_ndjson=temp("data/genbank_{serotype}.ndjson"),
genbank_ndjson=temp("data/genbank.ndjson"),
params:
serotype_tax_id=download_serotype,
serotype_tax_id='186536', # Returns 3530 records, check if we need a more specific Taxon ID
csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/csv-to-ndjson",
fetch_from_genbank_url="https://raw.githubusercontent.com/nextstrain/dengue/new_ingest/ingest/bin/fetch-from-genbank",
genbank_url_url="https://raw.githubusercontent.com/nextstrain/dengue/new_ingest/ingest/bin/genbank-url", # Update if dengue merged
Expand All @@ -43,14 +37,14 @@ rule fetch_from_genbank:


def _get_all_sources(wildcards):
return [f"data/{source}_{wildcards.serotype}.ndjson" for source in config["sources"]]
return [f"data/{source}.ndjson" for source in config["sources"]]


rule fetch_all_sequences:
input:
all_sources=_get_all_sources,
output:
sequences_ndjson=temp("data/sequences_{serotype}.ndjson"),
sequences_ndjson=temp("data/sequences.ndjson"),
shell:
"""
cat {input.all_sources} > {output.sequences_ndjson}
Expand Down
20 changes: 10 additions & 10 deletions ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ rule concat_geolocation_rules:

rule transform:
input:
sequences_ndjson="data/sequences_{serotype}.ndjson",
sequences_ndjson="data/sequences.ndjson",
all_geolocation_rules="data/all-geolocation-rules.tsv",
output:
metadata=temp("data/raw_metadata_{serotype}.tsv"),
sequences="data/sequences_{serotype}.fasta",
metadata=temp("data/raw_metadata.tsv"),
sequences="data/sequences.fasta",
log:
"logs/transform_{serotype}.txt",
"logs/transform.txt",
params:
field_map=config["transform"]["field_map"],
strain_regex=config["transform"]["strain_regex"],
Expand Down Expand Up @@ -123,9 +123,9 @@ rule transform:

rule post_process_metadata:
input:
metadata="data/raw_metadata_{serotype}.tsv",
metadata="data/raw_metadata.tsv",
output:
metadata="data/metadata_{serotype}.tsv",
metadata="data/metadata.tsv",
params:
post_process_metadata_url="https://raw.githubusercontent.com/nextstrain/zika/ingest/ingest/bin/post_process_metadata.py",

Expand All @@ -144,11 +144,11 @@ rule post_process_metadata:

rule compress:
input:
sequences="data/sequences_{serotype}.fasta",
metadata="data/metadata_{serotype}.tsv",
sequences="data/sequences.fasta",
metadata="data/metadata.tsv",
output:
sequences="data/sequences_{serotype}.fasta.zst",
metadata="data/metadata_{serotype}.tsv.zst",
sequences="data/sequences.fasta.zst",
metadata="data/metadata.tsv.zst",
shell:
"""
zstd -T0 -o {output.sequences} {input.sequences}
Expand Down

0 comments on commit fc12467

Please sign in to comment.