From f89998193048c25ffb779d0fb73190100ff77f19 Mon Sep 17 00:00:00 2001 From: mjjimenez <51485618+mj-jimenez@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:39:49 +0200 Subject: [PATCH] Clean DB to unify salt names and correct some approval annotations --- data/synonyms.tsv | 203 +++++++++++++++++++++++++++++++++++++++ src/cleanTable.R | 237 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 440 insertions(+) create mode 100755 data/synonyms.tsv create mode 100644 src/cleanTable.R diff --git a/data/synonyms.tsv b/data/synonyms.tsv new file mode 100755 index 0000000..08a5aaa --- /dev/null +++ b/data/synonyms.tsv @@ -0,0 +1,203 @@ +show_drug_name new_show_drug_name +4-AMINOSALICYLIC ACID 4-AMINOSALICYLIC ACID +POTASSIUM 4-AMINOSALICYLATE 4-AMINOSALICYLIC ACID +ACAMPROSATE ACAMPROSATE +ACAMPROSATE CALCIUM ACAMPROSATE +AFATINIB AFATINIB +AFATINIB DIMALEATE AFATINIB +ALECTINIB ALECTINIB +ALECTINIB HYDROCHLORIDE ALECTINIB +AMANTADINE AMANTADINE +AMANTADINE HYDROCHLORIDE AMANTADINE +BALSALAZIDE BALSALAZIDE +BALSALAZIDE DISODIUM BALSALAZIDE +BAZEDOXIFENE BAZEDOXIFENE +BAZEDOXIFENE ACETATE BAZEDOXIFENE +BEVACIZUMAB BEVACIZUMAB +BEVACIZUMAB 111IN BEVACIZUMAB +BRENTUXIMAB BRENTUXIMAB +BRENTUXIMAB VEDOTIN BRENTUXIMAB +BROMFENAC BROMFENAC +BROMFENAC SODIUM BROMFENAC +CALCITONIN CALCITONIN +CALCITONIN [USAN] CALCITONIN +CHLORMETHINE CHLORMETHINE +MECHLORETHAMINE HYDROCHLORIDE CHLORMETHINE +CLOMIPHENE CLOMIFENE +CLOMIFENE CITRATE CLOMIFENE +COBIMETINIB COBIMETINIB +COBIMETINIB FUMARATE COBIMETINIB +DABRAFENIB DABRAFENIB +DABRAFENIB MESYLATE DABRAFENIB +DACOMITINIB DACOMITINIB +DACOMITINIB HYDRATE DACOMITINIB +DANTROLENE DANTROLENE +DANTROLENE SODIUM DANTROLENE +DAUNORUBICIN DAUNORUBICIN +DAUNORUBICIN HYDROCHLORIDE DAUNORUBICIN +DEXAMETHASONE DEXAMETHASONE +DEXAMETHASONE ACETATE DEXAMETHASONE +DEXAMETHASONE DIPROPIONATE DEXAMETHASONE +DEXAMETHASONE SODIUM PHOSPHATE DEXAMETHASONE +DEXAMETHASONE VALERATE DEXAMETHASONE +DEXAMETHASONE-SPERMINE DEXAMETHASONE +DICLOFENAC 
DICLOFENAC +DICLOFENAC EPOLAMINE DICLOFENAC +DICLOFENAC POTASSIUM DICLOFENAC +DICLOFENAC SODIUM DICLOFENAC +DOXORUBICIN DOXORUBICIN +DOXORUBICIN HYDROCHLORIDE DOXORUBICIN +ECONAZOLE ECONAZOLE +ECONAZOLE NITRATE ECONAZOLE +ELTROMBOPAG ELTROMBOPAG +ELTROMBOPAG OLAMINE ELTROMBOPAG +ERLOTINIB ERLOTINIB +ERLOTINIB HYDROCHLORIDE ERLOTINIB +ESKETAMINE ESKETAMINE +ESKETAMINE HYDROCHLORIDE ESKETAMINE +ESTRADIOL ACETATE ESTRADIOL +ESTRADIOL BENZOATE ESTRADIOL +ESTRADIOL DIENANTHATE ESTRADIOL +ESTRADIOL DIPROPIONATE ESTRADIOL +ESTRADIOL ENANTHATE ESTRADIOL +ESTRADIOL PHOSPHATE ESTRADIOL +ESTRADIOL PROPIONATE ESTRADIOL +ESTRADIOL VALERATE ESTRADIOL +ESTRADIOLUM ESTRADIOL +ESTRAMUSTINE ESTRAMUSTINE +ESTRAMUSTINE PHOSPHATE SODIUM ESTRAMUSTINE +ESTRAMUSTINE PHOSPHATE ESTRAMUSTINE +ESTRIOL ESTRIOL +ESTRIOL TRIPROPIONATE ESTRIOL +ESTRONE ESTRONE +ESTRONE 3-O-SULFAMATE ESTRONE +FENOPROFEN FENOPROFEN +FENOPROFEN CALCIUM FENOPROFEN +FLUOXETINE FLUOXETINE +FLUOXETINE HYDROCHLORIDE FLUOXETINE +FLURBIPROFEN FLURBIPROFEN +FLURBIPROFEN SODIUM FLURBIPROFEN +FLURBIPROFEN METHYL ESTER FLURBIPROFEN +FLUTICASONE FLUTICASONE +FLUTICASONE FUROATE FLUTICASONE +FLUTICASONE PROPIONATE FLUTICASONE +GABAPENTIN GABAPENTIN +GABAPENTIN ENACARBIL GABAPENTIN +GEMCITABINE GEMCITABINE +GEMCITABINE ELAIDATE GEMCITABINE +GEMCITABINE HYDROCHLORIDE GEMCITABINE +HYDRALAZINE HYDRALAZINE +HYDRALAZINE HYDROCHLORIDE HYDRALAZINE +HYDROXYPROGESTERONE HYDROXYPROGESTERONE +HYDROXYPROGESTERONE CAPROATE HYDROXYPROGESTERONE +IBUPROFEN IBUPROFEN +IBUPROFEN SODIUM IBUPROFEN +IBUPROFEN LYSINE IBUPROFEN +IDARUBICIN IDARUBICIN +IDARUBICIN HYDROCHLORIDE IDARUBICIN +INSULIN ASPART INSULIN ASPART +INSULIN ASPART PROTAMINE RECOMBINANT INSULIN ASPART +IRINOTECAN IRINOTECAN +IRINOTECAN HYDROCHLORIDE IRINOTECAN +IRINOTECAN HYDROCHLORIDE TRIHYDRATE IRINOTECAN +IRON IRON +FERROUS GLUCONATE IRON +FERROUS SUCCINATE IRON +FERROUS FUMARATE IRON +FERROUS GLYCINE SULFATE IRON +KETAMINE KETAMINE +KETAMINE HYDROCHLORIDE KETAMINE +KETOROLAC 
KETOROLAC +KETOROLAC TROMETHAMINE KETOROLAC +LAPATINIB LAPATINIB +LAPATINIB DITOSYLATE LAPATINIB +LENVATINIB LENVATINIB +LENVATINIB MESYLATE LENVATINIB +LEVOTHYROXINE LEVOTHYROXINE +LEVOTHYROXINE SODIUM LEVOTHYROXINE +LIOTHYRONINE LIOTHYRONINE +LIOTHYRONINE SODIUM LIOTHYRONINE +MEPACRINE MEPACRINE +QUINACRINE DIHYDROCHLORIDE MEPACRINE +MEDROXYPROGESTERONE MEDROXYPROGESTERONE +MEDROXYPROGESTERONE ACETATE MEDROXYPROGESTERONE +MEGESTROL MEGESTROL +MEGESTROL ACETATE MEGESTROL +MEMANTINE MEMANTINE +MEMANTINE HYDROCHLORIDE MEMANTINE +MIDAZOLAM MIDAZOLAM +MIDAZOLAM HYDROCHLORIDE MIDAZOLAM +MITOXANTRONE MITOXANTRONE +MITOXANTRONE HYDROCHLORIDE MITOXANTRONE +MOMETASONE MOMETASONE +MOMETASONE FUROATE MOMETASONE +NAPROXEN NAPROXEN +NAPROXEN ETEMESIL NAPROXEN +NAPROXEN SODIUM NAPROXEN +NINTEDANIB NINTEDANIB +NINTEDANIB ESYLATE NINTEDANIB +NERAMEXANE NERAMEXANE +NERAMEXANE MESYLATE NERAMEXANE +NORETHISTERONE NORETHISTERONE +NORETHISTERONE ACETATE NORETHISTERONE +PANOBINOSTAT PANOBINOSTAT +PANOBINOSTAT LACTATE PANOBINOSTAT +OBATOCLAX OBATOCLAX +OBATOCLAX MESYLATE OBATOCLAX +ORPHENADRINE ORPHENADRINE +ORPHENADRINE CITRATE ORPHENADRINE +PACLITAXEL PACLITAXEL +DHA-PACLITAXEL PACLITAXEL +PACLITAXEL POLIGLUMEX PACLITAXEL +PACLITAXEL TREVATIDE PACLITAXEL +PAROXETINE PAROXETINE +PAROXETINE HYDROCHLORIDE PAROXETINE +PAROXETINE MESYLATE PAROXETINE +PAZOPANIB PAZOPANIB +PAZOPANIB HYDROCHLORIDE PAZOPANIB +POTASSIUM POTASSIUM +POTASSIUM ACETATE POTASSIUM +POTASSIUM ION POTASSIUM +POTASSIUM SULFATE POTASSIUM +POTASSIUM CHLORIDE POTASSIUM +PROCHLORPERAZINE PROCHLORPERAZINE +PROCHLORPERAZINE DIMALEATE PROCHLORPERAZINE +PROCHLORPERAZINE EDISYLATE PROCHLORPERAZINE +PYRIDOXAL PYRIDOXAL +PYRIDOXAL PHOSPHATE PYRIDOXAL +RALOXIFENE RALOXIFENE +RALOXIFENE HYDROCHLORIDE RALOXIFENE +RIBOCICLIB RIBOCICLIB +RIBOCICLIB SUCCINATE RIBOCICLIB +RUXOLITINIB RUXOLITINIB +RUXOLITINIB PHOSPHATE RUXOLITINIB +SERTRALINE SERTRALINE +SERTRALINE HYDROCHLORIDE SERTRALINE +SORAFENIB SORAFENIB +SORAFENIB TOSYLATE 
SORAFENIB +SUNITINIB SUNITINIB +SUNITINIB MALATE SUNITINIB +TAMOXIFEN TAMOXIFEN +TAMOXIFEN CITRATE TAMOXIFEN +TIROFIBAN TIROFIBAN +TIROFIBAN HYDROCHLORIDE TIROFIBAN +TOFACITINIB TOFACITINIB +TOFACITINIB CITRATE TOFACITINIB +TOREMIFENE TOREMIFENE +TOREMIFENE CITRATE TOREMIFENE +TRAMETINIB TRAMETINIB +TRAMETINIB DIMETHYL SULFOXIDE TRAMETINIB +ULIPRISTAL ULIPRISTAL +ULIPRISTAL ACETATE ULIPRISTAL +VARESPLADIB VARESPLADIB +VARESPLADIB METHYL VARESPLADIB +VERAPAMIL VERAPAMIL +VERAPAMIL HYDROCHLORIDE VERAPAMIL +VINBLASTINE VINBLASTINE +VINBLASTINE SULFATE VINBLASTINE +VITAMIN E ACETATE VITAMIN E +VITAMIN E SUCCINATE VITAMIN E +ZINC ZINC +ZINC ACETATE ZINC +ZINC CHLORIDE ZINC +ZINC SULFATE ZINC diff --git a/src/cleanTable.R b/src/cleanTable.R new file mode 100644 index 0000000..6b4718c --- /dev/null +++ b/src/cleanTable.R @@ -0,0 +1,237 @@

# cleanTable.R
#
# Unify salt / synonym drug names in the PanDrugs drug-gene table, collapse the
# per-drug and per-drug-gene annotations of the merged entries, recompute their
# DScores and write the cleaned table plus a filtered copy.
#
# Inputs : <out.dir>/PanDrugs_*_gs_dr.tsv  (exactly one match is required)
#          data/synonyms.tsv               (show_drug_name, new_show_drug_name)
# Outputs: <out.dir>/PanDrugs_Jun_06_2024_gs_dr.tsv
#          <out.dir>/PanDrugs_Jun_06_2024_gs_dr_filtered.tsv
#
# All paths are relative to the current working directory.
#
# Note on `x = unique(x)` inside grouped mutate() calls: mutate() stops with a
# size error when unique() returns a number of values that is neither 1 nor the
# group size, so these assignments flag most groups with inconsistent
# annotations instead of silently picking one value.
#
# R version 4.3.1 (2023-06-16)
library(tidyverse) # tidyverse_2.0.0

out.dir <- "2.0/"

# --- Data ---
# PanDrugs table. The table written at the end of this script matches the same
# pattern, so after a previous run the directory may hold several candidates;
# fail early with an explicit message instead of handing read.table() a vector.
pandrugs.file <- list.files(out.dir, pattern = "PanDrugs_.*_gs_dr.tsv",
                            full.names = TRUE)
if (length(pandrugs.file) != 1) {
  stop(sprintf(
    "Expected exactly one file matching '%s' in '%s', found %d: %s",
    "PanDrugs_.*_gs_dr.tsv", out.dir, length(pandrugs.file),
    toString(pandrugs.file)
  ), call. = FALSE)
}
pandrugs <- read.table(pandrugs.file, header = TRUE, sep = "\t", quote = "")

# Salt synonyms (quoting disabled, as for the main table, so that quote
# characters in a drug name cannot swallow fields)
synonyms <- read.table("data/synonyms.tsv", header = TRUE, sep = "\t",
                       quote = "")

# --- Code ---
# Select a reference synonym for standard_drug_name: the row whose original
# name equals the unified name or, when a group has no such row, its first row.
synonyms <- synonyms %>%
  mutate(reference = new_show_drug_name == show_drug_name) %>%
  group_by(new_show_drug_name) %>%
  mutate(has.reference = any(reference),
         label = row_number()) %>%
  ungroup() %>%
  mutate(reference = if_else(!has.reference & label == 1, TRUE, reference)) %>%
  select(-c(has.reference, label))

# New PanDrugs table
pandrugs.new <- pandrugs

# Standardise names (two hand-curated renames applied before the synonym merge)
pandrugs.new <- pandrugs.new %>%
  mutate(is.indatraline = show_drug_name == "(R,S)-INDATRALINE",
         is.conjugatedestrogen = source_drug_name == "CONJUGATED ESTROGENS",
         standard_drug_name =
           case_when(is.indatraline ~ "INDATRALINE",
                     is.conjugatedestrogen ~ "CONJUGATED ESTROGENS",
                     TRUE ~ standard_drug_name),
         show_drug_name =
           case_when(is.indatraline ~ "INDATRALINE",
                     is.conjugatedestrogen ~ "CONJUGATED ESTROGENS",
                     TRUE ~ show_drug_name)) %>%
  select(-is.indatraline, -is.conjugatedestrogen)

# Merge synonyms: ID is the unified name, NA for drugs without a synonym entry
pandrugs.new <- pandrugs.new %>%
  left_join(synonyms, by = "show_drug_name") %>%
  rename(ID = new_show_drug_name) %>%
  unique()

# Collapse drug info (one set of drug-level annotations per unified name).
# The "1" placeholder stands for an uninformative value ("Other" / ""); the
# code relies on it sorting first so that it can be stripped whenever the group
# also holds informative values.
updated <- pandrugs.new %>%
  filter(!is.na(ID)) %>%
  select(-show_drug_name) %>%
  unique() %>%
  group_by(ID) %>%
  mutate(standard_drug_name = unique(standard_drug_name[reference]),
         family = if_else(family == "Other", "1", family),
         family = paste(sort(unique(str_trim(unlist(
           str_split(family, pattern = ", "))))), collapse = ", "),
         family = str_remove(family, pattern = "^1, "),
         family = if_else(family == "1", "Other", family),
         n.status = length(unique(status)),
         is.approved = any(status == "Approved"),
         is.clinical = any(status == "Clinical trials"),
         is.experimental = any(status == "Experimental"),
         pathology = unique(pathology),
         is.cancer = any(cancer == "cancer"),
         is.clinicalcancer = any(cancer == "clinical cancer"),
         cancer = if_else(is.cancer | is.clinicalcancer | cancer == "",
                          "1", cancer),
         cancer = paste(sort(unique(str_trim(unlist(
           str_split(cancer, pattern = "\\|"))))), collapse = " | "),
         cancer = str_remove(cancer, pattern = "^1 \\| "),
         cancer = if_else(cancer == "1", "", cancer),
         extra = if_else(extra == "", "1", extra),
         extra = paste(sort(unique(str_trim(unlist(
           str_split(extra, pattern = ", "))))), collapse = ", "),
         extra = str_remove(extra, pattern = "^1, "),
         extra = if_else(extra == "1", "", extra),
         # Join the distinct non-empty values with "|". Dropping the empty
         # values first avoids stray separators without ever removing the
         # separator between two real values.
         extra2 = paste(setdiff(unique(extra2), ""), collapse = "|")) %>%
  ungroup() %>%
  # With several statuses in a group the most advanced one wins; status
  # combinations not listed below yield NA.
  mutate(status = case_when(n.status > 1 & is.approved ~ "Approved",
                            n.status > 1 & is.clinical ~ "Clinical trials",
                            n.status > 1 & is.experimental ~ "Experimental",
                            n.status == 1 ~ status),
         cancer = case_when(status == "Approved" & !is.clinicalcancer ~ cancer,
                            status == "Approved" & is.clinicalcancer ~
                              "clinical cancer",
                            status == "Clinical trials" & is.cancer ~ "cancer",
                            status == "Clinical trials" ~ "",
                            status == "Experimental" ~ "")) %>%
  # Positional range: drops `reference` and every helper column added above
  select(-c(reference:is.clinicalcancer)) %>%
  unique()

# Collapse drug-gene info
updated <- updated %>%
  group_by(gene_symbol) %>%
  mutate(checked_gene_symbol = unique(checked_gene_symbol)) %>%
  ungroup() %>%
  group_by(ID, checked_gene_symbol) %>%
  mutate(pathways = unique(pathways),
         n.target_marker = length(unique(target_marker)),
         ind_pathway = unique(ind_pathway),
         gene_dependency = unique(gene_dependency),
         gscore = unique(gscore),
         driver_gene = unique(driver_gene)) %>%
  ungroup() %>%
  # A drug-gene pair annotated with two different values becomes "target"
  mutate(target_marker = if_else(n.target_marker == 2, "target",
                                 target_marker)) %>%
  unique() %>%
  group_by(ID, source, checked_gene_symbol, resistance) %>%
  mutate(alteration = unique(alteration)) %>%
  ungroup() %>%
  group_by(ID, source, checked_gene_symbol, alteration) %>%
  mutate(resistance = unique(resistance)) %>%
  ungroup() %>%
  select(-n.target_marker) %>%
  unique()

# Merge all entries and recompute DScores.
# is.cancer keeps "" and "clinical cancer" as they are and maps every other
# value of `cancer` to "cancer" (the "solid tumors" branch gives the same
# result as the default branch). Combinations of target_marker / status /
# is.cancer that are not listed below get an NA dscore.
updated <- updated %>%
  mutate(is.cancer = case_when(cancer %in% c("", "clinical cancer") ~ cancer,
                               extra == "solid tumors" ~ "cancer",
                               TRUE ~ "cancer"),
         dscore =
           case_when(target_marker == "target" & status == "Approved" &
                       is.cancer == "cancer" ~ 1,
                     target_marker == "marker" & status == "Approved" &
                       is.cancer == "cancer" ~ 0.9,
                     target_marker == "target" & status == "Approved" &
                       is.cancer == "clinical cancer" ~ 0.8,
                     target_marker == "marker" & status == "Approved" &
                       is.cancer == "clinical cancer" ~ 0.7,
                     target_marker == "target" & status == "Clinical trials" &
                       is.cancer == "cancer" ~ 0.6,
                     target_marker == "marker" & status == "Clinical trials" &
                       is.cancer == "cancer" ~ 0.5,
                     target_marker == "target" & status == "Approved" &
                       is.cancer == "" ~ 0.4,
                     target_marker == "marker" & status == "Approved" &
                       is.cancer == "" ~ 0.3,
                     target_marker == "target" & status == "Clinical trials" &
                       is.cancer == "" ~ 0.2,
                     target_marker == "marker" & status == "Clinical trials" &
                       is.cancer == "" ~ 0.1,
                     target_marker == "target" & status == "Experimental" &
                       is.cancer == "" ~ 0.0008,
                     target_marker == "marker" & status == "Experimental" &
                       is.cancer == "" ~ 0.0004),
         # Resistance associations carry the same magnitude with negative sign
         dscore = if_else(resistance == "resistance", -1 * dscore, dscore)) %>%
  rename(show_drug_name = ID) %>%
  # Restores the original column set and order; drops the is.cancer helper
  select(any_of(colnames(pandrugs.new))) %>%
  mutate(version = "New")

# Add to PanDrugs table: untouched rows ("Old") plus the collapsed ones ("New")
pandrugs.new <- pandrugs.new %>%
  filter(is.na(ID)) %>%
  select(-c(ID, reference)) %>%
  mutate(version = "Old") %>%
  bind_rows(updated) %>%
  unique()

# Clean VITAMIN E entry (n.status == 2 & n.cancer == 2)
vitamin.e <- pandrugs.new %>%
  filter(show_drug_name == "VITAMIN E")

# Inspection only: shows the annotations of the rows that were not merged via
# the synonym table; the result is not used further.
vitamin.e %>%
  filter(version == "Old") %>%
  select(standard_drug_name, status, cancer, dscore) %>%
  unique()

vitamin.e <- vitamin.e %>%
  group_by(show_drug_name) %>%
  mutate(standard_drug_name = "VITAMIN E",
         family = unique(family),
         status = "Withdrawn",
         pathology = unique(pathology),
         cancer = "",
         extra = unique(extra),
         extra2 = unique(extra2),
         dscore = 0) %>%
  ungroup()

vitamin.e <- vitamin.e %>%
  group_by(gene_symbol) %>%
  mutate(checked_gene_symbol = unique(checked_gene_symbol)) %>%
  ungroup() %>%
  group_by(checked_gene_symbol, show_drug_name) %>%
  mutate(pathways = unique(pathways),
         target_marker = unique(target_marker),
         ind_pathway = unique(ind_pathway),
         gene_dependency = unique(gene_dependency),
         gscore = unique(gscore),
         driver_gene = unique(driver_gene)) %>%
  ungroup() %>%
  group_by(checked_gene_symbol, show_drug_name, source, resistance) %>%
  mutate(alteration = unique(alteration)) %>%
  ungroup() %>%
  group_by(checked_gene_symbol, show_drug_name, source, alteration) %>%
  mutate(resistance = unique(resistance)) %>%
  ungroup() %>%
  unique()

pandrugs.new <- pandrugs.new %>%
  filter(show_drug_name != "VITAMIN E") %>%
  bind_rows(vitamin.e) %>%
  unique()

# Update MIDOSTAURIN, L-ASPARAGINASE and IDARUBICIN entries: cancer = blood
# (names = drugs to update, values = the extra2 annotation they receive)
blood.drugs <- c("MIDOSTAURIN" = "TARGETED THERAPY",
                 "L-ASPARAGINASE" = "CHEMOTHERAPY",
                 "IDARUBICIN" = "CHEMOTHERAPY")

pandrugs.new <- pandrugs.new %>%
  mutate(is.blood = show_drug_name %in% names(blood.drugs),
         cancer = if_else(is.blood, "blood", cancer),
         # unname(): the lookup returns a named vector; keep the column plain
         extra2 = if_else(is.blood, unname(blood.drugs[show_drug_name]),
                          extra2),
         dscore = case_when(is.blood & target_marker == "target" ~ 1,
                            is.blood & target_marker == "marker" ~ 0.9,
                            TRUE ~ dscore),
         dscore = if_else(is.blood & resistance == "resistance", -1 * dscore,
                          dscore))

# Remove extra columns and check collapsed columns
pandrugs.new <- pandrugs.new %>%
  select(-c(version, is.blood)) %>%
  unique()

# Save table
out.file <- paste0(out.dir, "PanDrugs_Jun_06_2024_gs_dr.tsv")
write.table(pandrugs.new, col.names = TRUE, row.names = FALSE, quote = FALSE,
            sep = "\t", file = out.file)

# Filter sensitivity/resistance entries
pandrugs.new <- pandrugs.new %>%
  filter(resistance != "sensitivity / resistance")

# Save table
out.file.filtered <- paste0(out.dir, "PanDrugs_Jun_06_2024_gs_dr_filtered.tsv")
write.table(pandrugs.new, col.names = TRUE, row.names = FALSE, quote = FALSE,
            sep = "\t", file = out.file.filtered)