From 639f448dcc8e35e3cf663bda28592e4fab965e82 Mon Sep 17 00:00:00 2001
From: j23414
Date: Wed, 30 Nov 2022 07:55:55 -0800
Subject: [PATCH] replace post processing Rscript with python

* update help statement
* make --outfile required
* simplify reordering output columns
* nuanced viruslineage_ids processing
* when multiple paper urls, pick one
* read every column as text, as the R script did
* tolerate missing strain and viruslineage_ids values
---
 ingest/bin/post_process_metadata.R            |  53 --------
 ingest/bin/post_process_metadata.py           | 140 +++++++++++++++++++++
 ingest/workflow/snakemake_rules/transform.smk |   2 +-
 3 files changed, 141 insertions(+), 54 deletions(-)
 delete mode 100755 ingest/bin/post_process_metadata.R
 create mode 100755 ingest/bin/post_process_metadata.py

diff --git a/ingest/bin/post_process_metadata.R b/ingest/bin/post_process_metadata.R
deleted file mode 100755
index a02cf353..00000000
--- a/ingest/bin/post_process_metadata.R
+++ /dev/null
@@ -1,53 +0,0 @@
-#! /usr/bin/env Rscript
-
-library(tidyverse)
-library(magrittr)
-
-outfile="processed_metadata.tsv"
-args <- commandArgs(trailingOnly = TRUE)
-if (length(args) == 0) {
-  stop("Expecting one argument, a NCBI Virus metadata.tsv file.n", call. = FALSE)
-} else if (length(args) == 1) {
-  metadata <- args[1]
-} else if (length(args) == 2) {
-  metadata <- args[1]
-  outfile <- args[2]
-}
-
-data <- readr::read_delim(metadata,
-  delim = "\t",
-  col_types = cols(.default = "c"))
-
-cdata <- data %>%
-  mutate(
-    url=paste("https://www.ncbi.nlm.nih.gov/nuccore/",accession,sep=""),
-    strain = case_when(
-      (strain != accession) ~ strain,
-      (accession == strain) & (!is.na(strain_s)) ~ strain_s,
-      TRUE ~ strain),
-    strain = strain %>%
-      gsub(" ", "_", .) %>%
-      gsub("-", "_", .) %>%
-      gsub("\\.", "_", .) %>%
-      gsub("\\(","_", .) %>%
-      gsub("\\)", "_", .),
-    strain_s=NULL,
-    serotype=case_when(
-      grepl("11053", viruslineage_ids) ~ "denv1",
-      grepl("11060", viruslineage_ids) ~ "denv2",
-      grepl("11069", viruslineage_ids) ~ "denv3",
-      grepl("11070", viruslineage_ids) ~ "denv4"
-    ),
-    viruslineage_ids = NULL,
-    authors = abbr_authors,
-    abbr_authors = NULL,
-    paper_url = case_when(
-      !is.na(publications) ~ paste("https://www.ncbi.nlm.nih.gov/pubmed/", publications, se = "") %>% gsub(",.*", "", .)),
-    publications = NULL,
-    city = location,
-    location = NULL
-  ) %>%
-  select(c("strain", "accession", "serotype", "date", "updated", "region", "country", "division", "city","authors", "url", "title", "paper_url"))
-
-cdata[is.na(cdata)] = "?"
-readr::write_delim(cdata, outfile, delim = "\t")
diff --git a/ingest/bin/post_process_metadata.py b/ingest/bin/post_process_metadata.py
new file mode 100755
index 00000000..06fef215
--- /dev/null
+++ b/ingest/bin/post_process_metadata.py
@@ -0,0 +1,140 @@
+#! /usr/bin/env python3
+
+"""Reformat an NCBI Virus metadata table for a pathogen build.
+
+Reads the tab-separated file given by --metadata, derives the strain, url,
+paper_url, serotype, authors and city columns, and writes the selected
+output columns to the file given by --outfile.
+"""
+# ===== Dependencies
+import argparse
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+# Maps each character that is not wanted in a strain name to "_".
+_STRAIN_NAME_TABLE = str.maketrans(" -.()", "_____")
+
+
+def parse_args():
+    """Parse the required --metadata and --outfile command-line options."""
+    parser = argparse.ArgumentParser(
+        description="Reformat a NCBI Virus metadata.tsv file for a pathogen build."
+    )
+    parser.add_argument(
+        "--metadata", help="NCBI Virus metadata.tsv file.", required=True
+    )
+    parser.add_argument(
+        "--outfile",
+        help="Output file name, e.g. processed_metadata.tsv.",
+        required=True,
+    )
+
+    return parser.parse_args()
+
+
+# === Private methods
+
+
+def _set_strain_name(record):
+    """Pick the strain name for a record and replace unwanted characters.
+
+    "strain" is used unless it is missing or equal to the accession, in which
+    case "strain_s" is used when present. Spaces, dashes, periods and
+    parentheses are replaced by underscores. Returns "" when no name is found.
+    """
+    strain_name = record["strain"]
+    if pd.isna(strain_name) or strain_name == record["accession"]:
+        if pd.notna(record["strain_s"]):
+            strain_name = record["strain_s"]
+
+    # A missing value is a float NaN, which has no string methods.
+    if pd.isna(strain_name):
+        return ""
+
+    return str(strain_name).translate(_STRAIN_NAME_TABLE)
+
+
+def _set_url(record):
+    """Set url column from accession"""
+    return "https://www.ncbi.nlm.nih.gov/nuccore/" + str(record["accession"])
+
+
+def _set_paper_url(record):
+    """Return the PubMed URL of the first listed publication, or "" if none."""
+    publications = record["publications"]
+    if pd.isna(publications):
+        return ""
+
+    # Several comma-separated PubMed IDs may be listed; link to the first.
+    first_id = str(publications).split(",")[0].strip()
+    return "https://www.ncbi.nlm.nih.gov/pubmed/" + first_id
+
+
+def _set_dengue_serotype(record):
+    """Return the dengue serotype named in viruslineage_ids, or "" if none.
+
+    viruslineage_ids is split on commas and each ID is compared whole, so
+    "110530" does not match "11053".
+    """
+    dengue_types = {
+        "11053": "denv1",
+        "11060": "denv2",
+        "11069": "denv3",
+        "11070": "denv4",
+    }
+
+    lineage = record["viruslineage_ids"]
+    if pd.isna(lineage):
+        return ""
+
+    # str() guards against a value that was parsed as a number.
+    ids = {taxid.strip() for taxid in str(lineage).split(",")}
+
+    for taxid, serotype in dengue_types.items():
+        if taxid in ids:
+            return serotype
+
+    return ""
+
+
+# === Main Method
+def main():
+    """Read the metadata, derive the output columns and write the result."""
+    args = parse_args()
+    # Read every column as text, as the R script this replaces did, so that
+    # IDs are not turned into numbers (e.g. "12345" becoming 12345.0).
+    df = pd.read_csv(args.metadata, sep="\t", header=0, dtype=str)
+
+    # Mutate commands
+    df["strain"] = df.apply(_set_strain_name, axis=1)
+    df["url"] = df.apply(_set_url, axis=1)
+    df["paper_url"] = df.apply(_set_paper_url, axis=1)
+    df["serotype"] = df.apply(_set_dengue_serotype, axis=1)
+    df["authors"] = df["abbr_authors"]
+    df["city"] = df["location"]
+
+    # Format output
+    METADATA_COLUMNS = [
+        "strain",
+        "accession",
+        "genbank_accession_rev",
+        "serotype",
+        "date",
+        "updated",
+        "region",
+        "country",
+        "division",
+        "city",
+        "authors",
+        "url",
+        "title",
+        "paper_url",
+    ]
+    df.to_csv(args.outfile, sep="\t", index=False, columns=METADATA_COLUMNS)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
index a557e7ca..7ada602e 100644
--- a/ingest/workflow/snakemake_rules/transform.smk
+++ b/ingest/workflow/snakemake_rules/transform.smk
@@ -128,7 +128,7 @@ rule post_process_metadata:
         metadata="data/metadata_{serotype}.tsv",
     shell:
         """
-        ./bin/post_process_metadata.R {input.metadata} {output.metadata}
+        ./bin/post_process_metadata.py --metadata {input.metadata} --outfile {output.metadata}
         """
 
 rule compress: