Skip to content

Commit

Permalink
replace post processing Rscript with python
Browse files Browse the repository at this point in the history
* update help statement
* make --outfile required
* simplify reordering output columns
* nuanced viruslineage_ids processing
* when multiple paper urls, pick one
  • Loading branch information
j23414 authored and j23414 committed Apr 17, 2023
1 parent 25408b6 commit 639f448
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 54 deletions.
53 changes: 0 additions & 53 deletions ingest/bin/post_process_metadata.R

This file was deleted.

120 changes: 120 additions & 0 deletions ingest/bin/post_process_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#! /usr/bin/env python3

"""Reformat pandas DataTables for a pathogen build.
Expects two required options: --metadata (an NCBI Virus metadata.tsv file to
read) and --outfile (the path of the processed TSV file to write).
"""
# ===== Dependencies
import argparse
import os
import sys

import numpy as np
import pandas as pd


def parse_args(argv=None):
    """Parse the command-line options of the metadata post-processing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, so existing callers that
            pass nothing behave exactly as before.

    Returns:
        An ``argparse.Namespace`` with two string attributes: ``metadata``
        (the input TSV path) and ``outfile`` (the output TSV path).

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            an unknown option is given.
    """
    parser = argparse.ArgumentParser(
        description="Reformat a NCBI Virus metadata.tsv file for a pathogen build."
    )
    parser.add_argument(
        "--metadata", help="NCBI Virus metadata.tsv file.", required=True
    )
    parser.add_argument(
        "--outfile",
        help="Output file name, e.g. processed_metadata.tsv.",
        required=True,
    )

    return parser.parse_args(argv)


# === Private methods


# Every character in the first string is replaced by "_" in strain names.
_STRAIN_NAME_TRANSLATION = str.maketrans(" -.()", "_____")


def _set_strain_name(record):
    """Choose the strain name for a record and sanitize it.

    Selection order:
      1. ``record["strain"]`` when it is present and differs from the
         accession.
      2. ``record["strain_s"]`` when it is present (reached when the strain
         is missing or merely repeats the accession).
      3. ``record["strain"]`` when present (i.e. the accession itself).
      4. ``record["accession"]`` as a last resort when the strain is missing.

    Args:
        record: Mapping-like row (e.g. a pandas Series or a dict) providing
            the keys ``strain``, ``accession`` and ``strain_s``.

    Returns:
        The chosen name as a string with spaces, hyphens, periods and
        parentheses each replaced by an underscore.
    """
    strain = record["strain"]
    accession = record["accession"]
    alternate = record["strain_s"]

    if pd.notna(strain) and strain != accession:
        strain_name = strain
    elif pd.notna(alternate):
        strain_name = alternate
    elif pd.notna(strain):
        strain_name = strain
    else:
        strain_name = accession

    # str() guards against non-string cells (NaN, or numbers when pandas
    # inferred a numeric dtype), which have no string methods.
    return str(strain_name).translate(_STRAIN_NAME_TRANSLATION)


def _set_url(record):
    """Return the NCBI nuccore URL built from the record's accession.

    Args:
        record: Mapping-like row providing the key ``accession``.

    Returns:
        The nuccore base URL followed by ``str(record["accession"])``.
    """
    accession = record["accession"]
    return f"https://www.ncbi.nlm.nih.gov/nuccore/{accession!s}"


def _set_paper_url(record):
    """Return the PubMed URL of the record's first listed publication.

    Args:
        record: Mapping-like row providing the key ``publications``, a
            comma-separated list of publication IDs (or a missing value).

    Returns:
        ``"https://www.ncbi.nlm.nih.gov/pubmed/<id>"`` for the first ID in the
        list, or ``""`` when the value is missing or its first entry is blank.
    """
    publications = record["publications"]
    if pd.isna(publications):
        return ""

    # A column of IDs with missing cells is inferred as float64 by pandas, so
    # an ID can arrive as 12345678.0; drop the spurious fractional part.
    if isinstance(publications, float) and publications.is_integer():
        publications = int(publications)

    # When several publications are listed, only the first one is linked.
    first_id = str(publications).split(",")[0].strip()
    if not first_id:
        return ""
    return "https://www.ncbi.nlm.nih.gov/pubmed/" + first_id


# Taxon ID (as it appears in viruslineage_ids) -> serotype label.
_DENGUE_SEROTYPE_BY_TAXID = {
    "11053": "denv1",
    "11060": "denv2",
    "11069": "denv3",
    "11070": "denv4",
}


def _set_dengue_serotype(record):
    """Derive the dengue serotype from the record's lineage taxon IDs.

    Args:
        record: Mapping-like row providing the key ``viruslineage_ids``, a
            comma-separated list of taxon IDs (or a missing value).

    Returns:
        ``"denv1"`` .. ``"denv4"`` for the first serotype taxon ID (in the
        order of the lookup table) found in the list, or ``""`` when the
        value is missing or holds none of the known IDs.
    """
    lineage = record["viruslineage_ids"]
    if pd.isna(lineage):
        return ""

    # A single-ID column may have been inferred as a number by pandas;
    # normalise e.g. 11053.0 to 11053 before comparing as text.
    if isinstance(lineage, float) and lineage.is_integer():
        lineage = int(lineage)

    # Exact, whitespace-insensitive ID matching (never substring matching).
    taxids = {taxid.strip() for taxid in str(lineage).split(",")}

    for taxid, serotype in _DENGUE_SEROTYPE_BY_TAXID.items():
        if taxid in taxids:
            return serotype

    return ""


# === Main Method
def main():
    """Read the metadata TSV, derive the output columns and write the result.

    Reads the file given by ``--metadata``, computes the ``strain``, ``url``,
    ``paper_url`` and ``serotype`` columns from each row, copies
    ``abbr_authors`` to ``authors`` and ``location`` to ``city``, and writes
    the selected columns, in a fixed order, as a TSV to ``--outfile``.
    """
    args = parse_args()

    # Read every column as text so pandas does not re-type ID-like fields
    # (e.g. turning a publication ID into 12345678.0); empty cells still
    # come back as missing values.
    df = pd.read_csv(args.metadata, sep="\t", header=0, dtype=str)

    # Snapshot the rows as dicts once. The helpers only read the input
    # columns, so each derived column can be computed from this snapshot, and
    # assigning plain lists also works for a header-only (zero-row) file.
    records = df.to_dict("records")
    df["strain"] = [_set_strain_name(record) for record in records]
    df["url"] = [_set_url(record) for record in records]
    df["paper_url"] = [_set_paper_url(record) for record in records]
    df["serotype"] = [_set_dengue_serotype(record) for record in records]
    df["authors"] = df["abbr_authors"]
    df["city"] = df["location"]

    # Columns written to the output file, in this order.
    output_columns = [
        "strain",
        "accession",
        "genbank_accession_rev",
        "serotype",
        "date",
        "updated",
        "region",
        "country",
        "division",
        "city",
        "authors",
        "url",
        "title",
        "paper_url",
    ]
    df.to_csv(args.outfile, sep="\t", index=False, columns=output_columns)


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ rule post_process_metadata:
metadata="data/metadata_{serotype}.tsv",
shell:
"""
./bin/post_process_metadata.R {input.metadata} {output.metadata}
./bin/post_process_metadata.py --metadata {input.metadata} --outfile {output.metadata}
"""

rule compress:
Expand Down

0 comments on commit 639f448

Please sign in to comment.