replace post processing Rscript with python

* update help statement
* make --outfile required
* simplify reordering output columns
* nuanced viruslineage_ids processing
* when multiple paper urls, pick one
nextstrain · Apr 17, 2023 · 639f448 · 639f448
1 parent 25408b6
commit 639f448
Show file tree

Hide file tree

Showing 3 changed files with 121 additions and 54 deletions.
diff --git a/ingest/bin/post_process_metadata.R b/ingest/bin/post_process_metadata.R
diff --git a/ingest/bin/post_process_metadata.py b/ingest/bin/post_process_metadata.py
@@ -0,0 +1,120 @@
+#! /usr/bin/env python3
+
+"""Reformat an NCBI Virus metadata TSV for a pathogen build.
+
+Expects two required options: --metadata, the NCBI Virus metadata.tsv file to
+read, and --outfile, the path of the reformatted TSV to write.
+"""
+# ===== Dependencies
+import argparse
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+
+def parse_args():
+    """Parse the command-line options for this script.
+
+    Both ``--metadata`` and ``--outfile`` are required.
+
+    Returns:
+        The namespace returned by ``ArgumentParser.parse_args``.
+    """
+    cli = argparse.ArgumentParser(
+        description="Reformat a NCBI Virus metadata.tsv file for a pathogen build."
+    )
+
+    # Register each required option from a small (flag, help text) table.
+    required_options = (
+        ("--metadata", "NCBI Virus metadata.tsv file."),
+        ("--outfile", "Output file name, e.g. processed_metadata.tsv."),
+    )
+    for flag, help_text in required_options:
+        cli.add_argument(flag, help=help_text, required=True)
+
+    return cli.parse_args()
+
+
+# === Private methods
+
+
+def _set_strain_name(record):
+    """Return a sanitized strain name chosen from ``strain`` and ``strain_s``.
+
+    ``strain`` is used unless it is missing or merely repeats ``accession``;
+    in that case ``strain_s`` is used when present, otherwise the accession.
+    A missing value therefore no longer raises AttributeError.
+
+    Each space, "-", ".", "(" and ")" in the chosen name becomes "_".
+    """
+    name = record["strain"]
+    if pd.isna(name) or name == record["accession"]:
+        # No real strain name here; prefer strain_s, else keep the accession.
+        alternate = record["strain_s"]
+        name = alternate if pd.notna(alternate) else record["accession"]
+
+    # str() guards against values pandas parsed as numbers; translate() does
+    # all five single-character substitutions in one pass.
+    return str(name).translate(str.maketrans(" -.()", "_____"))
+
+
+def _set_url(record):
+    """Build the NCBI nuccore URL for the record's ``accession`` value."""
+    return f"https://www.ncbi.nlm.nih.gov/nuccore/{record['accession']}"
+
+
+def _set_paper_url(record):
+    """Return the PubMed URL for the first ID in ``publications``, or "".
+
+    ``publications`` is treated as a comma-separated list of IDs; only the
+    first one is used. A missing value, or an empty first entry, gives "".
+    """
+    publications = record["publications"]
+    if pd.isna(publications):
+        return ""
+
+    # pandas reads an all-numeric column with gaps as float, so a single ID
+    # can arrive as 12345.0; render it without the trailing ".0".
+    if isinstance(publications, float) and publications.is_integer():
+        publications = int(publications)
+
+    first_id = str(publications).split(",")[0].strip()
+    if not first_id:
+        return ""
+    return "https://www.ncbi.nlm.nih.gov/pubmed/" + first_id
+
+
+def _set_dengue_serotype(record):
+    """Return the dengue serotype named by the record's ``viruslineage_ids``.
+
+    ``viruslineage_ids`` is treated as a comma-separated list of IDs. The
+    result is the first of denv1..denv4 (checked in that order) whose ID is
+    in the list, or "" when none is present or the value is missing.
+    """
+    dengue_types = {
+        "11053": "denv1",
+        "11060": "denv2",
+        "11069": "denv3",
+        "11070": "denv4",
+    }
+
+    lineage = record["viruslineage_ids"]
+    if pd.isna(lineage):
+        # A missing value is a float NaN, which has no .split().
+        return ""
+
+    # str() covers a lone ID parsed as an integer; strip() tolerates "a, b".
+    ids = {token.strip() for token in str(lineage).split(",")}
+
+    for taxid, serotype in dengue_types.items():
+        if taxid in ids:
+            return serotype
+
+    return ""
+
+
+# === Main Method
+def main():
+    """Read the metadata TSV, derive the output columns, and write the result.
+
+    Reads the file given by ``--metadata``, sets the ``strain``, ``url``,
+    ``paper_url``, ``serotype``, ``authors`` and ``city`` columns, then writes
+    the columns listed in ``METADATA_COLUMNS`` to ``--outfile`` as TSV.
+    """
+    args = parse_args()
+    # Read every column as text. With dtype inference, an integer-like column
+    # that has gaps becomes float, so IDs such as PubMed IDs would gain a
+    # trailing ".0" in paper_url and in the written file.
+    df = pd.read_csv(args.metadata, sep="\t", header=0, dtype=str)
+
+    # Derived columns, computed one row at a time.
+    df["strain"] = df.apply(_set_strain_name, axis=1)
+    df["url"] = df.apply(_set_url, axis=1)
+    df["paper_url"] = df.apply(_set_paper_url, axis=1)
+    df["serotype"] = df.apply(_set_dengue_serotype, axis=1)
+    # Plain copies under the output column names.
+    df["authors"] = df["abbr_authors"]
+    df["city"] = df["location"]
+
+    # Columns to write; any other column in df is dropped from the output.
+    METADATA_COLUMNS = [
+        "strain",
+        "accession",
+        "genbank_accession_rev",
+        "serotype",
+        "date",
+        "updated",
+        "region",
+        "country",
+        "division",
+        "city",
+        "authors",
+        "url",
+        "title",
+        "paper_url",
+    ]
+    df.to_csv(args.outfile, sep="\t", index=False, columns=METADATA_COLUMNS)
+
+
+# Run the conversion only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
@@ -128,7 +128,7 @@ rule post_process_metadata:
         metadata="data/metadata_{serotype}.tsv",
     shell:
        """
-       ./bin/post_process_metadata.R {input.metadata} {output.metadata}
+       ./bin/post_process_metadata.py --metadata {input.metadata} --outfile {output.metadata}
        """
 
 rule compress: