Skip to content

Commit

Permalink
Intermediate pickle files and DB formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
alyssa-adams committed Nov 19, 2021
1 parent 43d283f commit 73dde9f
Showing 1 changed file with 141 additions and 0 deletions.
141 changes: 141 additions & 0 deletions load_in_graphs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import csv
import os
import pickle
import re

import pandas
import pandas as pd

# one class:
# one def per data source
Expand Down Expand Up @@ -72,3 +74,142 @@ def make_ppi_out_edges(file_in, file_out):
# Build the PPI out-edge table on disk, then load it back into memory.
make_ppi_out_edges(file_in, file_out)
with open(file_out, 'rb') as f:
    ppi_out_edges = pickle.load(f)

# Keep only the whole_bdm pickles, skipping macOS junk and the diff files.
files = [
    name for name in os.listdir(pickle_out)
    if re.search('whole_bdm', name)
    and not re.search('DS_Store', name)
    and not re.search('diffs', name)
]


def make_df(bdm_pickle_files, id_table, metabolic_data, file_out):

    '''
    Makes the initial dataframe of protein values, one row per protein,
    then joins on the metabolic annotations and pickles the result.

    Relies on module-level globals: pickle_out (directory holding the bdm
    pickle files), ncbi (taxonomy helper with get_taxid_translator),
    biobdm (helper with get_desired_ranks) and ppi_out_edges (dict of PPI
    out-edges keyed by STRING id).

    :param bdm_pickle_files: List of pickled bdm files (names inside pickle_out)
    :param id_table: id translation table to get ncbi ids; id_table[0] is
        assumed to be a dict mapping STRING id -> NCBI taxid (TODO confirm)
    :param metabolic_data: df of metabolic output; first column is the
        category, second the function, last a comma-separated protein id list
    :param file_out: save this df to a pickle file with this path
    :return: None, just saves a pickle file
    '''

    dict_to_df = {}

    for file in bdm_pickle_files:

        with open(os.path.join(pickle_out, file), 'rb') as f:
            bdms = pickle.load(f)

        for k in bdms:

            # get NCBI id; the table's first entry is the NCBI number
            ncbi_id = id_table[0][k] if k in id_table[0] else None

            # defaults; only filled in when the protein resolves to a host
            species = None
            family = None
            order = None
            clas = None
            phylum = None
            kingdom = None

            # only interested in ones with known names, but keep non-matches
            # in the dict anyways
            if ncbi_id:

                # translate taxid -> species name
                taxid_names = ncbi.get_taxid_translator([int(ncbi_id)])
                species = list(taxid_names.values())[0]

                # decide if virus or host; fall back to the bdm group label
                # when the taxid did not translate to a species name
                label = species if species else bdms[k]['group']
                if re.search(' virus ', label) or re.search(' phage ', label):
                    protein_type = 'virus'
                else:
                    protein_type = 'host'

                # get taxonomy information of host
                if protein_type == 'host':
                    desired_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
                    ranks = biobdm.get_desired_ranks(ncbi_id, desired_ranks)
                    family = ranks['family_id']
                    order = ranks['order_id']
                    clas = ranks['class_id']
                    phylum = ranks['phylum_id']
                    kingdom = ranks['kingdom_id']

            else:

                # no NCBI id: classify from the bdm group label alone
                if re.search(' virus ', bdms[k]['group']) or re.search(' phage ', bdms[k]['group']):
                    protein_type = 'virus'
                else:
                    protein_type = 'host'

            # get out-edges of PPI network (just ncbi id); was a bare
            # except: before -- .get() only defaults on a genuinely
            # missing key instead of swallowing every error
            out_edges = ppi_out_edges.get(k)

            # save all this information to dict_to_df
            dict_to_df[k] = {
                'string_id': k,
                'group': bdms[k]['group'],
                'whole_bdm': bdms[k]['whole_bdm'],
                'length': bdms[k]['length'],
                'protein_type': protein_type,
                'ncbi_id': ncbi_id,
                'species': species,
                'family': family,
                'order': order,
                'class': clas,
                'phylum': phylum,
                'kingdom': kingdom,
                'out_edges': out_edges,
            }

    df = pd.DataFrame.from_dict(dict_to_df, orient='index')
    df = df.reset_index(drop=True)

    # add in metabolic output
    df_to_join = {}

    # for each row, map every protein id in the last column to its
    # metabolic category and function
    for index, row in metabolic_data.iterrows():
        # .iloc makes the positional access explicit (row[0]/row[-1] mixed
        # label and positional lookup, which newer pandas rejects)
        category = row.iloc[0]
        function = row.iloc[1]
        protein_ids = row.iloc[-1].split(',')

        for protein in protein_ids:
            df_to_join[protein] = {
                'string_id': protein,
                'category': category,
                'function': function,
            }

    df_to_join = pd.DataFrame.from_dict(df_to_join, orient='index')

    # do a join on the two tables, keyed by the STRING id
    df = df.set_index('string_id').join(df_to_join.set_index('string_id'))
    df = df.reset_index()

    # pickle this DF to load in all at once (must have lots of ram)
    with open(file_out, 'wb') as handle:
        pickle.dump(df, handle)

    return None

0 comments on commit 73dde9f

Please sign in to comment.