From 639f448dcc8e35e3cf663bda28592e4fab965e82 Mon Sep 17 00:00:00 2001
From: j23414 <jenchang@iastate.edu>
Date: Wed, 30 Nov 2022 07:55:55 -0800
Subject: [PATCH] replace post processing Rscript with python

* update help statement
* make --outfile required
* simplify reordering output columns
* nuanced viruslineage_ids processing
* when multiple paper urls, pick one
---
 ingest/bin/post_process_metadata.R            |  53 --------
 ingest/bin/post_process_metadata.py           | 120 ++++++++++++++++++
 ingest/workflow/snakemake_rules/transform.smk |   2 +-
 3 files changed, 121 insertions(+), 54 deletions(-)
 delete mode 100755 ingest/bin/post_process_metadata.R
 create mode 100755 ingest/bin/post_process_metadata.py

diff --git a/ingest/bin/post_process_metadata.R b/ingest/bin/post_process_metadata.R
deleted file mode 100755
index a02cf353..00000000
--- a/ingest/bin/post_process_metadata.R
+++ /dev/null
@@ -1,53 +0,0 @@
-#! /usr/bin/env Rscript
-
-library(tidyverse)
-library(magrittr)
-
-outfile="processed_metadata.tsv"
-args <- commandArgs(trailingOnly = TRUE)
-if (length(args) == 0) {
-  stop("Expecting one argument, a NCBI Virus metadata.tsv file.n", call. = FALSE)
-} else if (length(args) == 1) {
-  metadata <- args[1]
-} else if (length(args) == 2) {
-  metadata <- args[1]
-  outfile <- args[2]
-}
-
-data <- readr::read_delim(metadata,
-                          delim = "\t",
-                          col_types = cols(.default = "c"))
-
-cdata <- data %>%
-  mutate(
-    url=paste("https://www.ncbi.nlm.nih.gov/nuccore/",accession,sep=""),
-    strain = case_when(
-      (strain != accession) ~ strain,
-      (accession == strain) & (!is.na(strain_s)) ~ strain_s,
-      TRUE ~ strain),
-    strain = strain %>%
-      gsub(" ", "_", .) %>%
-      gsub("-", "_", .) %>%
-      gsub("\\.", "_", .) %>%
-      gsub("\\(","_", .) %>%
-      gsub("\\)", "_", .),
-    strain_s=NULL,
-    serotype=case_when(
-      grepl("11053", viruslineage_ids) ~ "denv1",
-      grepl("11060", viruslineage_ids) ~ "denv2",
-      grepl("11069", viruslineage_ids) ~ "denv3",
-      grepl("11070", viruslineage_ids) ~ "denv4"
-      ),
-    viruslineage_ids = NULL,
-    authors = abbr_authors,
-    abbr_authors = NULL,
-    paper_url = case_when(
-      !is.na(publications) ~ paste("https://www.ncbi.nlm.nih.gov/pubmed/", publications, se = "") %>% gsub(",.*", "", .)),
-    publications = NULL,
-    city = location,
-    location = NULL
-  ) %>%
-  select(c("strain", "accession", "serotype", "date", "updated", "region", "country", "division", "city","authors", "url", "title", "paper_url"))
-
-cdata[is.na(cdata)] = "?"
-readr::write_delim(cdata, outfile, delim = "\t")
diff --git a/ingest/bin/post_process_metadata.py b/ingest/bin/post_process_metadata.py
new file mode 100755
index 00000000..06fef215
--- /dev/null
+++ b/ingest/bin/post_process_metadata.py
@@ -0,0 +1,120 @@
+#! /usr/bin/env python3
+
+"""Reformat pandas DataTables for a pathogen build.
+
+Expects two required options: --metadata (a NCBI Virus metadata.tsv file) and --outfile.
+"""
+# ===== Dependencies
+import argparse
+import os
+import sys
+
+import numpy as np
+import pandas as pd
+
+
+def parse_args():
+    """Parse the command line; --metadata and --outfile are both required.
+
+    Returns the argparse namespace with `metadata` and `outfile` attributes.
+    """
+    cli = argparse.ArgumentParser(
+        description="Reformat a NCBI Virus metadata.tsv file for a pathogen build."
+    )
+    required_options = {
+        "--metadata": "NCBI Virus metadata.tsv file.",
+        "--outfile": "Output file name, e.g. processed_metadata.tsv.",
+    }
+    for flag, help_text in required_options.items():
+        cli.add_argument(flag, help=help_text, required=True)
+
+    return cli.parse_args()
+
+
+# === Private methods
+
+
+def _set_strain_name(record):
+    """Pick the strain name (strain, else strain_s) and sanitize it.
+
+    strain_s is used when strain is missing or merely repeats the accession.
+    Returns "" when no name is available, rather than raising on a missing value.
+    """
+    strain_name = record["strain"]
+    if pd.isna(strain_name) or strain_name == record["accession"]:
+        if pd.notna(record["strain_s"]):
+            strain_name = record["strain_s"]
+    if pd.isna(strain_name):
+        return ""
+
+    # space - . ( ) each become _
+    # (str() covers names that pandas parsed as numbers)
+    return str(strain_name).translate(str.maketrans(" -.()", "_____"))
+
+
+def _set_url(record):
+    """Build the NCBI nuccore link for the record's accession"""
+    return f"https://www.ncbi.nlm.nih.gov/nuccore/{record['accession']!s}"
+
+
+def _set_paper_url(record):
+    """Set paper_url from the first PubMed id in publications ("" if missing)"""
+    pubs = "" if pd.isna(record["publications"]) else str(record["publications"])
+    # ids are integers, so also drop the ".0" a float-parsed column would add
+    first_id = pubs.split(",")[0].split(".")[0].strip()
+    return "https://www.ncbi.nlm.nih.gov/pubmed/" + first_id if first_id else ""
+
+
+def _set_dengue_serotype(record):
+    """Set dengue serotype from viruslineage_ids ("" if none can be found)
+
+    viruslineage_ids is split on commas and each id is compared whole, so
+    e.g. 110530 does not count as 11053. The first matching serotype wins.
+    """
+    dengue_types = {
+        "11053": "denv1",
+        "11060": "denv2",
+        "11069": "denv3",
+        "11070": "denv4",
+    }
+    if pd.isna(record["viruslineage_ids"]):
+        return ""  # nothing to split when the lineage is missing
+    ids = [taxid.strip() for taxid in str(record["viruslineage_ids"]).split(",")]
+    return next((dengue_types[t] for t in dengue_types if t in ids), "")
+
+
+# === Main Method
+def main():
+    """Read the metadata tsv, derive the output columns and write --outfile."""
+    args = parse_args()
+    # dtype=str: keep ids, dates and names as text (no 12345678.0 PubMed ids)
+    df = pd.read_csv(args.metadata, sep="\t", header=0, dtype=str)
+
+    df["strain"] = df.apply(_set_strain_name, axis=1)
+    df["url"] = df.apply(_set_url, axis=1)
+    df["paper_url"] = df.apply(_set_paper_url, axis=1)
+    df["serotype"] = df.apply(_set_dengue_serotype, axis=1)
+    df["authors"] = df["abbr_authors"]
+    df["city"] = df["location"]
+
+    METADATA_COLUMNS = [  # the only columns written, in this order
+        "strain",
+        "accession",
+        "genbank_accession_rev",
+        "serotype",
+        "date",
+        "updated",
+        "region",
+        "country",
+        "division",
+        "city",
+        "authors",
+        "url",
+        "title",
+        "paper_url",
+    ]
+    df.to_csv(args.outfile, sep="\t", index=False, columns=METADATA_COLUMNS)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
index a557e7ca..7ada602e 100644
--- a/ingest/workflow/snakemake_rules/transform.smk
+++ b/ingest/workflow/snakemake_rules/transform.smk
@@ -128,7 +128,7 @@ rule post_process_metadata:
         metadata="data/metadata_{serotype}.tsv",
     shell:
        """
-       ./bin/post_process_metadata.R {input.metadata} {output.metadata}
+       ./bin/post_process_metadata.py --metadata {input.metadata} --outfile {output.metadata}
        """
 
 rule compress: