Skip to content

Commit

Permalink
Use "accession" column as ID column directly
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Aug 14, 2023
1 parent 126c514 commit 9ea1d81
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 23 deletions.
51 changes: 31 additions & 20 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,18 +83,18 @@ rule decompress:
zstd -d -c {input.metadata} > {output.metadata}
"""

rule wrangle_metadata:
input:
metadata="data/metadata_{serotype}.tsv",
output:
metadata="results/wrangled_metadata_{serotype}.tsv",
params:
strain_id=config.get("strain_id_field", "strain"), #accession
shell:
"""
csvtk -t rename -f strain -n strain_original {input.metadata} \
| csvtk -t mutate -f {params.strain_id} -n strain > {output.metadata}
"""
# rule wrangle_metadata:
# input:
# metadata="data/metadata_{serotype}.tsv",
# output:
# metadata="results/wrangled_metadata_{serotype}.tsv",
# params:
# strain_id=config.get("strain_id_field", "strain"), #accession
# shell:
# """
# csvtk -t rename -f strain -n strain_original {input.metadata} \
# | csvtk -t mutate -f {params.strain_id} -n strain > {output.metadata}
# """

rule filter:
"""
Expand All @@ -106,19 +106,21 @@ rule filter:
"""
input:
sequences = "data/sequences_{serotype}.fasta",
metadata = "results/wrangled_metadata_{serotype}.tsv",
metadata = "data/metadata_{serotype}.tsv",
exclude = files.dropped_strains
output:
sequences = "results/filtered_{serotype}.fasta"
params:
group_by = "year region",
sequences_per_group = filter_sequences_per_group,
min_length = 5000
min_length = 5000,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id {params.strain_id} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
Expand Down Expand Up @@ -175,20 +177,22 @@ rule refine:
input:
tree = "results/tree-raw_{serotype}.nwk",
alignment = "results/aligned_{serotype}.fasta",
metadata = "results/wrangled_metadata_{serotype}.tsv"
metadata = "data/metadata_{serotype}.tsv"
output:
tree = "results/tree_{serotype}.nwk",
node_data = "results/branch-lengths_{serotype}.json"
params:
coalescent = "const",
date_inference = "marginal",
clock_filter_iqd = 4
clock_filter_iqd = 4,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id {params.strain_id} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--timetree \
Expand Down Expand Up @@ -240,17 +244,19 @@ rule traits:
"""
input:
tree = "results/tree_{serotype}.nwk",
metadata = "results/wrangled_metadata_{serotype}.tsv"
metadata = "data/metadata_{serotype}.tsv"
output:
node_data = "results/traits_{serotype}.json",
params:
columns = traits_columns,
sampling_bias_correction = 3
sampling_bias_correction = 3,
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id {params.strain_id} \
--output {output.node_data} \
--columns {params.columns} \
--confidence \
Expand Down Expand Up @@ -279,7 +285,7 @@ rule export:
"""Exporting data files for for auspice"""
input:
tree = "results/tree_{serotype}.nwk",
metadata = "results/wrangled_metadata_{serotype}.tsv",
metadata = "data/metadata_{serotype}.tsv",
branch_lengths = "results/branch-lengths_{serotype}.json",
traits = "results/traits_{serotype}.json",
clades = "results/clades_{serotype}.json",
Expand All @@ -289,11 +295,14 @@ rule export:
output:
auspice_json = "results/raw_dengue_{serotype}.json",
root_sequence = "results/raw_dengue_{serotype}_root-sequence.json",
params:
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id {params.strain_id} \
--node-data {input.branch_lengths} {input.traits} {input.clades} {input.nt_muts} {input.aa_muts} \
--auspice-config {input.auspice_config} \
--include-root-sequence \
Expand All @@ -303,17 +312,19 @@ rule export:
rule final_strain_name:
input:
auspice_json="results/raw_dengue_{serotype}.json",
metadata="results/wrangled_metadata_{serotype}.tsv",
metadata="data/metadata_{serotype}.tsv",
root_sequence="results/raw_dengue_{serotype}_root-sequence.json",
output:
auspice_json="auspice/dengue_{serotype}.json",
root_sequence="auspice/dengue_{serotype}_root-sequence.json",
params:
display_strain_field=config.get("display_strain_field", "strain"),
strain_id=config.get("strain_id_field", "strain"),
shell:
"""
python3 bin/set_final_strain_name.py \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--input-auspice-json {input.auspice_json} \
--display-strain-name {params.display_strain_field} \
--output {output.auspice_json}
Expand Down
6 changes: 4 additions & 2 deletions bin/set_final_strain_name.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
import json, argparse
from augur.io import read_metadata

def replace_name_recursive(node, lookup):
if node["name"] in lookup:
Expand All @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup):

parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
parser.add_argument('--metadata', type=str, required=True, help="input data")
parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
args = parser.parse_args()

metadata = pd.read_csv(args.metadata, sep='\t')
metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns)
name_lookup = {}
for ri, row in metadata.iterrows():
strain_id = row['strain']
strain_id = row.name
name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name]

with open(args.input_auspice_json, 'r') as fh:
Expand Down
2 changes: 1 addition & 1 deletion config/config_dengue.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

0 comments on commit 9ea1d81

Please sign in to comment.