Skip to content

Commit

Permalink
Intermediate pickle files and DB formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
alyssa-adams committed Nov 19, 2021
1 parent 43d283f commit 73dde9f
Showing 1 changed file with 141 additions and 0 deletions.
141 changes: 141 additions & 0 deletions load_in_graphs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import csv
import os
import pickle
import re

import pandas
import pandas as pd

# one class:
# one def per data source
Expand Down Expand Up @@ -72,3 +74,142 @@ def make_ppi_out_edges(file_in, file_out):
# Build the PPI out-edge table on disk, then load it back into memory.
make_ppi_out_edges(file_in, file_out)
with open(file_out, 'rb') as f:
    ppi_out_edges = pickle.load(f)

# Keep only the whole_bdm pickles, skipping macOS junk and the diff files.
files = [
    name for name in os.listdir(pickle_out)
    if re.search('whole_bdm', name)
    and not re.search('DS_Store', name)
    and not re.search('diffs', name)
]


def make_df(bdm_pickle_files, id_table, metabolic_data, file_out):

    '''
    Makes the initial dataframe of protein values, one row per protein,
    then joins on the metabolic annotations and pickles the result.

    Relies on module-level globals: pickle_out (directory holding the bdm
    pickle files), ncbi (taxonomy helper with get_taxid_translator),
    biobdm (helper with get_desired_ranks) and ppi_out_edges (dict of PPI
    out-edges keyed by STRING id).

    :param bdm_pickle_files: List of pickled bdm files (names inside pickle_out)
    :param id_table: id translation table to get ncbi ids; id_table[0] is
        assumed to be a dict mapping STRING id -> NCBI taxid (TODO confirm)
    :param metabolic_data: df of metabolic output; first column is the
        category, second the function, last a comma-separated protein id list
    :param file_out: save this df to a pickle file with this path
    :return: None, just saves a pickle file
    '''

    dict_to_df = {}

    for file in bdm_pickle_files:

        with open(os.path.join(pickle_out, file), 'rb') as f:
            bdms = pickle.load(f)

        for k in bdms:

            # get NCBI id; the table's first entry is the NCBI number
            ncbi_id = id_table[0][k] if k in id_table[0] else None

            # defaults; only filled in when the protein resolves to a host
            species = None
            family = None
            order = None
            clas = None
            phylum = None
            kingdom = None

            # only interested in ones with known names, but keep non-matches
            # in the dict anyways
            if ncbi_id:

                # translate taxid -> species name
                taxid_names = ncbi.get_taxid_translator([int(ncbi_id)])
                species = list(taxid_names.values())[0]

                # decide if virus or host; fall back to the bdm group label
                # when the taxid did not translate to a species name
                label = species if species else bdms[k]['group']
                if re.search(' virus ', label) or re.search(' phage ', label):
                    protein_type = 'virus'
                else:
                    protein_type = 'host'

                # get taxonomy information of host
                if protein_type == 'host':
                    desired_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
                    ranks = biobdm.get_desired_ranks(ncbi_id, desired_ranks)
                    family = ranks['family_id']
                    order = ranks['order_id']
                    clas = ranks['class_id']
                    phylum = ranks['phylum_id']
                    kingdom = ranks['kingdom_id']

            else:

                # no NCBI id: classify from the bdm group label alone
                if re.search(' virus ', bdms[k]['group']) or re.search(' phage ', bdms[k]['group']):
                    protein_type = 'virus'
                else:
                    protein_type = 'host'

            # get out-edges of PPI network (just ncbi id); was a bare
            # except: before -- .get() only defaults on a genuinely
            # missing key instead of swallowing every error
            out_edges = ppi_out_edges.get(k)

            # save all this information to dict_to_df
            dict_to_df[k] = {
                'string_id': k,
                'group': bdms[k]['group'],
                'whole_bdm': bdms[k]['whole_bdm'],
                'length': bdms[k]['length'],
                'protein_type': protein_type,
                'ncbi_id': ncbi_id,
                'species': species,
                'family': family,
                'order': order,
                'class': clas,
                'phylum': phylum,
                'kingdom': kingdom,
                'out_edges': out_edges,
            }

    df = pd.DataFrame.from_dict(dict_to_df, orient='index')
    df = df.reset_index(drop=True)

    # add in metabolic output
    df_to_join = {}

    # for each row, map every protein id in the last column to its
    # metabolic category and function
    for index, row in metabolic_data.iterrows():
        # .iloc makes the positional access explicit (row[0]/row[-1] mixed
        # label and positional lookup, which newer pandas rejects)
        category = row.iloc[0]
        function = row.iloc[1]
        protein_ids = row.iloc[-1].split(',')

        for protein in protein_ids:
            df_to_join[protein] = {
                'string_id': protein,
                'category': category,
                'function': function,
            }

    df_to_join = pd.DataFrame.from_dict(df_to_join, orient='index')

    # do a join on the two tables, keyed by the STRING id
    df = df.set_index('string_id').join(df_to_join.set_index('string_id'))
    df = df.reset_index()

    # pickle this DF to load in all at once (must have lots of ram)
    with open(file_out, 'wb') as handle:
        pickle.dump(df, handle)

    return None

0 comments on commit 73dde9f

Please sign in to comment.