diff --git a/Snakefile b/Snakefile index e3d8dd15..245107e6 100644 --- a/Snakefile +++ b/Snakefile @@ -83,18 +83,18 @@ rule decompress: zstd -d -c {input.metadata} > {output.metadata} """ -rule wrangle_metadata: - input: - metadata="data/metadata_{serotype}.tsv", - output: - metadata="results/wrangled_metadata_{serotype}.tsv", - params: - strain_id=config.get("strain_id_field", "strain"), #accession - shell: - """ - csvtk -t rename -f strain -n strain_original {input.metadata} \ - | csvtk -t mutate -f {params.strain_id} -n strain > {output.metadata} - """ +# rule wrangle_metadata: +# input: +# metadata="data/metadata_{serotype}.tsv", +# output: +# metadata="results/wrangled_metadata_{serotype}.tsv", +# params: +# strain_id=config.get("strain_id_field", "strain"), #accession +# shell: +# """ +# csvtk -t rename -f strain -n strain_original {input.metadata} \ +# | csvtk -t mutate -f {params.strain_id} -n strain > {output.metadata} +# """ rule filter: """ @@ -106,19 +106,21 @@ rule filter: """ input: sequences = "data/sequences_{serotype}.fasta", - metadata = "results/wrangled_metadata_{serotype}.tsv", + metadata = "data/metadata_{serotype}.tsv", exclude = files.dropped_strains output: sequences = "results/filtered_{serotype}.fasta" params: group_by = "year region", sequences_per_group = filter_sequences_per_group, - min_length = 5000 + min_length = 5000, + strain_id = config.get("strain_id_field", "strain"), shell: """ augur filter \ --sequences {input.sequences} \ --metadata {input.metadata} \ + --metadata-id {params.strain_id} \ --exclude {input.exclude} \ --output {output.sequences} \ --group-by {params.group_by} \ @@ -175,20 +177,22 @@ rule refine: input: tree = "results/tree-raw_{serotype}.nwk", alignment = "results/aligned_{serotype}.fasta", - metadata = "results/wrangled_metadata_{serotype}.tsv" + metadata = "data/metadata_{serotype}.tsv" output: tree = "results/tree_{serotype}.nwk", node_data = "results/branch-lengths_{serotype}.json" params: coalescent = "const", date_inference = "marginal", - clock_filter_iqd = 4 + clock_filter_iqd = 4, + strain_id = config.get("strain_id_field", "strain"), shell: """ augur refine \ --tree {input.tree} \ --alignment {input.alignment} \ --metadata {input.metadata} \ + --metadata-id {params.strain_id} \ --output-tree {output.tree} \ --output-node-data {output.node_data} \ --timetree \ @@ -240,17 +244,19 @@ rule traits: """ input: tree = "results/tree_{serotype}.nwk", - metadata = "results/wrangled_metadata_{serotype}.tsv" + metadata = "data/metadata_{serotype}.tsv" output: node_data = "results/traits_{serotype}.json", params: columns = traits_columns, - sampling_bias_correction = 3 + sampling_bias_correction = 3, + strain_id = config.get("strain_id_field", "strain"), shell: """ augur traits \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id {params.strain_id} \ --output {output.node_data} \ --columns {params.columns} \ --confidence \ @@ -279,7 +285,7 @@ rule export: """Exporting data files for for auspice""" input: tree = "results/tree_{serotype}.nwk", - metadata = "results/wrangled_metadata_{serotype}.tsv", + metadata = "data/metadata_{serotype}.tsv", branch_lengths = "results/branch-lengths_{serotype}.json", traits = "results/traits_{serotype}.json", clades = "results/clades_{serotype}.json", @@ -289,11 +295,14 @@ rule export: output: auspice_json = "results/raw_dengue_{serotype}.json", root_sequence = "results/raw_dengue_{serotype}_root-sequence.json", + params: + strain_id = config.get("strain_id_field", "strain"), shell: """ augur export v2 \ --tree {input.tree} \ --metadata {input.metadata} \ + --metadata-id {params.strain_id} \ --node-data {input.branch_lengths} {input.traits} {input.clades} {input.nt_muts} {input.aa_muts} \ --auspice-config {input.auspice_config} \ --include-root-sequence \ @@ -303,17 +312,19 @@ rule export: rule final_strain_name: input: auspice_json="results/raw_dengue_{serotype}.json", - metadata="results/wrangled_metadata_{serotype}.tsv", + metadata="data/metadata_{serotype}.tsv", root_sequence="results/raw_dengue_{serotype}_root-sequence.json", output: auspice_json="auspice/dengue_{serotype}.json", root_sequence="auspice/dengue_{serotype}_root-sequence.json", params: display_strain_field=config.get("display_strain_field", "strain"), + strain_id=config.get("strain_id_field", "strain"), shell: """ python3 bin/set_final_strain_name.py \ --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ --input-auspice-json {input.auspice_json} \ --display-strain-name {params.display_strain_field} \ --output {output.auspice_json} diff --git a/bin/set_final_strain_name.py b/bin/set_final_strain_name.py index 0036f2a5..08ca9359 100755 --- a/bin/set_final_strain_name.py +++ b/bin/set_final_strain_name.py @@ -1,5 +1,6 @@ import pandas as pd import json, argparse +from augur.io import read_metadata def replace_name_recursive(node, lookup): if node["name"] in lookup: @@ -17,14 +18,15 @@ def replace_name_recursive(node, lookup): parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json") parser.add_argument('--metadata', type=str, required=True, help="input data") + parser.add_argument('--metadata-id-columns', nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice") parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") args = parser.parse_args() - metadata = pd.read_csv(args.metadata, sep='\t') + metadata = read_metadata(args.metadata, id_columns=args.metadata_id_columns) name_lookup = {} for ri, row in metadata.iterrows(): - strain_id = row['strain'] + strain_id = row.name name_lookup[strain_id] = args.display_strain_name if pd.isna(row[args.display_strain_name]) else row[args.display_strain_name] with open(args.input_auspice_json, 'r') as fh: diff --git a/config/config_dengue.yaml b/config/config_dengue.yaml index aa8d0200..fa4e134a 100644 --- a/config/config_dengue.yaml +++ b/config/config_dengue.yaml @@ -1,2 +1,2 @@ strain_id_field: "accession" -display_strain_field: "strain_original" \ No newline at end of file +display_strain_field: "strain" \ No newline at end of file