From 674609895086df085db1c802a77ed2ab345f9c57 Mon Sep 17 00:00:00 2001
From: Nicole Gay <nicolerg@stanford.edu>
Date: Wed, 1 Mar 2023 09:01:48 -0800
Subject: [PATCH] finalize files for data hub

---
 inst/scripts/match-package-to-data-hub.R | 74 +++++++++++++++++++++---
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/inst/scripts/match-package-to-data-hub.R b/inst/scripts/match-package-to-data-hub.R
index 1c7d701..9ad1ff2 100644
--- a/inst/scripts/match-package-to-data-hub.R
+++ b/inst/scripts/match-package-to-data-hub.R
@@ -1,9 +1,9 @@
 #!/bin/R
 # Nicole Gay
 # 1/11/23
+# Updated 3/1/23
 
-library(MotrpacRatTraining6mo)
-library(MotrpacRatTraining6moData)
+library(MotrpacRatTraining6mo) # also attaches MotrpacRatTraining6moData
 library(data.table)
 secret = "it's a secret"
 secret2 = "it's also a secret"
@@ -323,21 +323,79 @@ colors = rbindlist(color_list)
 colors[hex_colour == "white", hex_colour := "#FFFFFF"]
 write.table(colors, "~/Desktop/pass1b-06_color_codes.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep="\t")
 
-####### I LEFT OFF HERE
-# TODO
+#### Write text files for lists #### 
 
-# custom handling for lists/nested lists 
 # "GENE_UNIVERSES"
 names(GENE_UNIVERSES)
 names(GENE_UNIVERSES$gene_symbol)
 names(GENE_UNIVERSES$gene_symbol$IMMUNO)
 
+# one output file per ID type; each column is one ome/tissue pair
+for (id_type in names(GENE_UNIVERSES)){
+  universe = GENE_UNIVERSES[[id_type]]
+  gene_cols = list() # keyed by "<ome>_<tissue>"
+  ome_row = NULL     # first header row, one entry per column
+  tissue_row = NULL  # second header row, one entry per column
+  n_max = 0          # running length of the longest column
+  for(ome in names(universe)){
+    for(tissue in names(universe[[ome]])){
+      ids = universe[[ome]][[tissue]]
+      n_max = max(n_max, length(ids))
+      ome_row = c(ome_row, ome)
+      tissue_row = c(tissue_row, tissue)
+      # store the IDs in sorted order, as character
+      gene_cols[[sprintf("%s_%s", ome, tissue)]] = as.character(ids[order(ids)])
+    }
+  }
+
+  # pad every column with NA so that all of them reach n_max
+  pad_to_max = function(v) c(v, rep(NA_character_, times=(n_max-length(v))))
+  gene_dt = data.table::copy(lapply(gene_cols, pad_to_max))
+  setDT(gene_dt) # turn the padded list into a data.table
+
+  # two header rows (ome, then tissue) stacked on top of the gene IDs
+  header = data.table(t(data.table(V1 = ome_row, V2 = tissue_row)))
+  out = rbindlist(list(header, gene_dt), use.names=FALSE)
+
+  # column names are not written; the two header rows identify each column
+  out_file = sprintf("~/Desktop/GENE_UNIVERSES_by_%s.txt",id_type)
+  write.table(out, file=out_file, col.names=FALSE, row.names=FALSE, quote=FALSE, sep='\t')
+}
 
 # "PATHWAY_PARENTS"
-
+head(names(PATHWAY_PARENTS))
+# pathway IDs come from the element names, parents from the flattened values
+parent_map = data.table(PATHWAY_ID = names(PATHWAY_PARENTS), PATHWAY_PARENTS = unname(unlist(PATHWAY_PARENTS)))
+write.table(parent_map, file="~/Desktop/PATHWAY_PARENTS.txt", col.names=TRUE, row.names=FALSE, quote=FALSE, sep='\t')
 
 # "REPFDR_INPUTS"
-
+names(REPFDR_INPUTS)
+for(f in names(REPFDR_INPUTS)){
+  df = as.data.frame(REPFDR_INPUTS[[f]])
+  # A data.frame always has row names, so rownames(df) is never NULL. Only promote them to a
+  # 'feature' column when they are real labels (not automatic 1..n); '&&' because if() needs a scalar.
+  if(.row_names_info(df) > 0L && !'feature' %in% colnames(df)) df = cbind(feature=rownames(df), df)
+  write.table(df, file=sprintf("~/Desktop/REPFDR_INPUTS_%s.txt",f), col.names=TRUE, row.names=FALSE, quote=FALSE, sep='\t')
+}
 
 # "REPFDR_RES"
-
+names(REPFDR_RES)
+# shared writer: tab-delimited, header row, no row names, no quoting
+write_tsv = function(x, path) write.table(x, file=path, col.names=TRUE, row.names=FALSE, quote=FALSE, sep='\t')
+
+# repfdr_em_res$mat, with its row names in a leading "feature" column
+write_tsv(data.table(cbind(data.table(feature=rownames(REPFDR_RES$repfdr_em_res$mat)),
+                           REPFDR_RES$repfdr_em_res$mat)),
+          "~/Desktop/REPFDR_RES_repfdr_em_res_matrix.txt")
+
+# repfdr_em_res$Pi, with its row names in a leading "state" column
+write_tsv(data.table(cbind(data.table(state=rownames(REPFDR_RES$repfdr_em_res$Pi)),
+                           REPFDR_RES$repfdr_em_res$Pi)),
+          "~/Desktop/REPFDR_RES_repfdr_em_res_Pi.txt")
+
+# repfdr_clusters, with its row names in a leading "rowname" column
+write_tsv(data.table(cbind(data.table(rowname=rownames(REPFDR_RES$repfdr_clusters)), REPFDR_RES$repfdr_clusters)),
+          "~/Desktop/REPFDR_RES_repfdr_clusters.txt")
+# repfdr_clusters_str is not written out here; repfdr_clusters_pi is (names -> "cluster", values -> "pi")
+write_tsv(data.table(cluster = names(REPFDR_RES$repfdr_clusters_pi), pi = REPFDR_RES$repfdr_clusters_pi),
+          "~/Desktop/REPFDR_RES_repfdr_clusters_pi.txt")