Remove ndjson scripts
Brings the ingest repo in line with the general
pattern of other pathogen ingests. See
<#9>
for more context.

Note that we no longer vendor ndjson scripts as
their functionality is now available via `augur
curate passthru` (sic); see
<nextstrain/ingest#28>
for more.

Closes <#9>
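
As a rough illustration of the new flow (not part of the commit; field values are invented): every record travels as a single NDJSON line, and `augur curate passthru` splits that line into a metadata row and a FASTA entry using the flags shown in the Snakefile diff below.

    # Sketch only: the shape of one record on the curate pipeline's stdin/stdout.
    # Field names mirror the Snakefile diff below; the values are made up.
    import json

    record = {
        "accession": "AB000000",    # becomes the FASTA id (--output-id-field accession)
        "strain_name": "example-1",
        "date": "2001-07-04",
        "country": "Japan",
        "sequence": "ACGT" * 10,    # written to the FASTA output (--output-seq-field sequence)
    }
    print(json.dumps(record))       # one JSON object per line, i.e. NDJSON
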
jameshadfield committed Mar 5, 2024
1 parent e4aa322 commit ec796da
Showing 4 changed files with 50 additions and 108 deletions.
71 changes: 38 additions & 33 deletions ingest/Snakefile
@@ -60,21 +60,45 @@ rule parse_genbank:
     input:
         genbank = "data/entrez/genbank.with-reference.gb",
     output:
-        sequences = "data/genbank.fasta",
-        metadata = "data/genbank.tsv",
+        ndjson = "data/genbank.ndjson"
     shell:
         """
-        scripts/parse-genbank.py --genbank {input.genbank} \
-            --output-sequences {output.sequences} --output-metadata {output.metadata}
+        scripts/parse-genbank.py --input {input.genbank} --output {output.ndjson}
         """

-rule re_circularise:
+rule curate:
     input:
-        metadata = "data/genbank.tsv",
-        sequences = "data/genbank.fasta",
+        ndjson = "data/genbank.ndjson"
     output:
-        sequences = "data/genbank.recircular.fasta",
-        metadata = "data/genbank.recircular.tsv",
+        metadata = "data/curated-metadata.tsv",
+        sequences = "data/curated-sequences.fasta",
+    params:
+        metadata_columns = ['name', 'accesion', "strain_name", "date", "year", "region", "country", "host", "genotype_genbank", "subgenotype_genbank", \
+            "circularise", "circularise_shift_bp","clade_nextclade","QC_overall_score","QC_overall_status","total_substitutions","total_deletions", \
+            "total_insertions","total_frame_shifts","total_missing","alignment_score","coverage","QC_missing_data","QC_mixed_sites","QC_rare_mutations", \
+            "QC_frame_shifts","QC_stop_codons"]
+    shell:
+        # scripts/fix_country_field.py Modifies country entries in the NDJSON records from stdin to split on the ':' character and discard any content after.
+        # vendored/apply-geolocation-rules
+        # scripts/add-year.py adds "year" to NDJSON entries
+        """
+        cat {input.ndjson} \
+            | scripts/fix_country_field.py \
+            | vendored/apply-geolocation-rules --geolocation-rules defaults/geoLocationRules.tsv \
+            | scripts/add-year.py \
+            | augur curate passthru \
+                --output-seq-field sequence --output-id-field accession \
+                --output-metadata {output.metadata} --output-fasta {output.sequences}
+        """
+
+
+rule recircularise:
+    input:
+        metadata = "data/curated-metadata.tsv",
+        sequences = "data/curated-sequences.fasta",
+    output:
+        sequences = "data/circularised.fasta",
+        metadata = "data/circularised.tsv",
     params:
         reference = config['reference_accession']
     shell:
@@ -91,8 +115,8 @@ rule align_everything:
     Note that the minimum seed match rate is specified in the dataset itself.
     """
     input:
-        sequences = "data/genbank.recircular.fasta",
-        metadata = "data/genbank.recircular.tsv",
+        sequences = "data/circularised.fasta",
+        metadata = "data/circularised.tsv",
     output:
         # Note that these outputs require `--output-basename nextclade`
         alignment = "data/nextclade/nextclade.aligned.fasta",
@@ -114,10 +138,10 @@ rule align_everything:

 rule join_nextclade_metadata:
     input:
-        metadata = "data/genbank.recircular.tsv",
+        metadata = "data/circularised.tsv",
         nextclade = "data/nextclade/nextclade.tsv"
     output:
-        metadata = "data/metadata.tsv",
+        metadata = "results/metadata.tsv",
         summary = "data/metadata.summary.txt",
     shell:
         """
@@ -126,28 +150,9 @@ rule join_nextclade_metadata:
             --output {output.metadata} --summary {output.summary}
         """

-rule transform_metadata:
-    input:
-        metadata = "data/metadata.tsv"
-    output:
-        metadata = "results/metadata.tsv"
-    params:
-        metadata_columns = ['name', 'accesion', "strain_name", "date", "year", "region", "country", "host", "genotype_genbank", "subgenotype_genbank", \
-            "circularise", "circularise_shift_bp","clade_nextclade","QC_overall_score","QC_overall_status","total_substitutions","total_deletions", \
-            "total_insertions","total_frame_shifts","total_missing","alignment_score","coverage","QC_missing_data","QC_mixed_sites","QC_rare_mutations", \
-            "QC_frame_shifts","QC_stop_codons"]
-    shell:
-        """
-        scripts/tsv-to-ndjson.py < {input.metadata} |
-            scripts/fix_country_field.py |
-            vendored/apply-geolocation-rules --geolocation-rules defaults/geoLocationRules.tsv |
-            scripts/add-year.py |
-            scripts/ndjson-to-tsv.py --metadata-columns {params.metadata_columns} --metadata {output.metadata}
-        """
-
 rule copy_ingest_files:
     input:
-        sequences = "data/genbank.recircular.fasta",
+        sequences = "data/circularised.fasta",
         aligned = "data/nextclade/nextclade.aligned.fasta",
     output:
         sequences = "results/sequences.fasta",
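
The new curate rule pipes NDJSON through small stdin-to-stdout filters before `augur curate passthru`. A minimal sketch of such a filter, modelled on the comment describing scripts/fix_country_field.py in the rule above (illustrative only, not the repository's actual script):

    #!/usr/bin/env python3
    # Sketch of an NDJSON filter in the style of scripts/fix_country_field.py:
    # read one JSON record per line from stdin, keep only the part of "country"
    # before any ':', and write each record back to stdout.
    import json
    import sys

    for line in sys.stdin:
        record = json.loads(line)
        country = record.get("country")
        if country:
            record["country"] = country.split(":")[0]
        print(json.dumps(record))

scripts/add-year.py follows the same stdin-to-stdout pattern, adding a "year" field to each record (per the comment in the rule above) before the stream reaches augur.
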
45 changes: 0 additions & 45 deletions ingest/scripts/ndjson-to-tsv.py

This file was deleted.

25 changes: 12 additions & 13 deletions ingest/scripts/parse-genbank.py
@@ -12,7 +12,7 @@
 import dateutil.parser
 from collections import defaultdict
 import re
-import csv
+import json

 class MissingDate(Exception):
     pass
@@ -192,17 +192,16 @@ def parse_metadata(record):
         description=__doc__,
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
-    parser.add_argument("--genbank", help="Input genbank file")
-    parser.add_argument("--output-sequences")
-    parser.add_argument("--output-metadata")
+    parser.add_argument("--input", metavar="GENBANK", help="Input genbank file")
+    parser.add_argument("--output", metavar="NDJSON")
     args = parser.parse_args()

     n_records = 0
-    seq_records = []
+    seq_records = {}
     exclude_counts = defaultdict(int)
     metadata = {}

-    for record in SeqIO.parse(open(args.genbank,"r"), "genbank"):
+    for record in SeqIO.parse(open(args.input,"r"), "genbank"):
         n_records+=1

         if exclude_isolate(record):
@@ -215,16 +214,16 @@ def parse_metadata(record):
             exclude_counts['missing date']+=1
             continue

-        seq_records.append(SeqRecord(record.seq, id=record_metadata['accession'], description=''))
+        accession = record_metadata['accession']
+        seq_records[accession] = SeqRecord(record.seq, id=accession, description='')
         metadata[record_metadata['accession']] = record_metadata

     print(f"{n_records} records parsed. Excluded "+', '.join([f"n={v} ({k})"for k,v in exclude_counts.items()]))
     summarise(metadata)

-    SeqIO.write(seq_records, args.output_sequences, "fasta")
-    with open(args.output_metadata, 'w') as fh:
-        writer = csv.writer(fh, delimiter='\t', quotechar='"')
-        header = list(next(iter(metadata.values())).keys())
-        writer.writerow(header)
+    metadata_keys = list(next(iter(metadata.values())).keys())
+    with open(args.output, 'w') as fh:
         for d in metadata.values():
-            writer.writerow([d[k] for k in header])
+            row = {k: d[k] for k in metadata_keys}
+            row['sequence'] = str(seq_records[row['accession']].seq)
+            print(json.dumps(row), file=fh)
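
With this change parse-genbank.py writes one NDJSON line per accepted GenBank record, carrying the metadata fields and the sequence together, so the separate FASTA and TSV outputs (and the csv writer) are no longer needed. A minimal sketch of consuming that output, assuming the data/genbank.ndjson path from the Snakefile diff above (not code from the commit):

    # Sketch: split one of the new NDJSON records back into the pieces that the
    # old FASTA + TSV outputs used to hold. Path taken from the Snakefile diff.
    import json

    with open("data/genbank.ndjson") as fh:
        for line in fh:
            row = json.loads(line)
            sequence = row.pop("sequence")   # the genome for this record
            accession = row["accession"]     # used as the sequence id downstream
            # `row` now holds only the per-record metadata fields
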
17 changes: 0 additions & 17 deletions ingest/scripts/tsv-to-ndjson.py

This file was deleted.
