Refactor for OSG/CHTC
alyssa-adams committed Jul 29, 2022
1 parent 401221c commit ac83025
Showing 8 changed files with 823 additions and 283 deletions.
33 changes: 33 additions & 0 deletions CHTC/get_uniprot_data.py
@@ -0,0 +1,33 @@
#!/usr/bin/env python

from time import sleep
import requests
import json
import sys
import os

if __name__ == '__main__':

    # uniprot url for node attributes
    url_base = 'https://rest.uniprot.org/uniprotkb/search?query=xref:string-'
    pid_file = sys.argv[1]
    results = {}

    with open(pid_file) as f:
        for pid in f:

            # strip the trailing newline so the query URL is well formed
            pid = pid.strip()
            if not pid:
                continue

            # rate-limit requests to the UniProt REST API
            sleep(1)
            url = url_base + pid + '&format=json'
            result = requests.get(url).text
            result = json.loads(result)

            # keep the first hit, or an empty dict when there is no match
            if len(result['results']) == 0:
                result = {}
            else:
                result = result['results'][0]

            results[pid] = result

    # save to file in directory (create it on first use)
    os.makedirs('results', exist_ok=True)
    with open(os.path.join('results', pid_file + '.json'), 'w') as f:
        json.dump(results, f)
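Each CHTC job writes one file under results/ mapping STRING protein IDs to UniProt records. A minimal sketch for merging the transferred chunk files back into one dictionary; the results directory path here is an assumption, not part of the commit:

import glob
import json
import os

# hypothetical: wherever the per-chunk JSON files were transferred back to
results_dir = os.path.join('networks', 'full', 'results')

merged = {}
for path in glob.glob(os.path.join(results_dir, '*.json')):
    with open(path) as f:
        # each file maps a STRING protein id to its UniProt record (or {})
        merged.update(json.load(f))

print(len(merged), 'protein ids fetched')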
17 changes: 17 additions & 0 deletions README
@@ -0,0 +1,17 @@
Workflow:
Get node properties from UniProt
Get node values during infection from ViPR
Make initial networks in Python → NetworkX objects
Trim networks with SPRAS
Network measures → Store values on nodes, edges, whole graph
Network perturbations → Store values on nodes and edges

Per job:
One network → SPRAS → Output network

Per job:
One network → One network measure → Dictionary of resulting values
How long does each measure take? Some take a long time, so we may need another solution for larger networks.

Per job:
One network → Measure complexity → Remove one graph object → Measure complexity → Resulting change (value)
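
A rough sketch of the complexity-perturbation job above, assuming the
complexity measure is any function from a graph to a number
(graph_complexity is a placeholder, not a function in this repo):

import networkx as nx

def perturbation_changes(G, graph_complexity):
    # measure the intact graph once
    baseline = graph_complexity(G)
    changes = {}
    for node in list(G.nodes()):
        # remove one graph object, re-measure, record the resulting change
        H = G.copy()
        H.remove_node(node)
        changes[node] = graph_complexity(H) - baseline
    return changes

# example: use graph density as a stand-in complexity measure
# changes = perturbation_changes(nx.karate_club_graph(), nx.density)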
29 changes: 29 additions & 0 deletions download_files.py
@@ -0,0 +1,29 @@
import urllib.request
import os
import gzip
import shutil

class FileDownloader:

    def single_file(self, source):

        if source == 'string_edges':

            # check to see if file already exists
            # TODO: check file version too
            file = os.path.join('data', 'input_files', 'string_edges.txt.gz')
            if os.path.exists(file):
                return None

            url = 'http://viruses.string-db.org/download/protein.links.full.v10.5.txt.gz'
            print('Downloading file from: ' + url)

            # make sure the target directory exists before downloading
            os.makedirs(os.path.join('data', 'input_files'), exist_ok=True)

            # download file
            urllib.request.urlretrieve(url, file)

            # unzip
            with gzip.open(file, 'rb') as f_in:
                with open(os.path.join('data', 'input_files', 'string_edges.txt'), 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

            print('Done!')
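Hypothetical usage, mirroring the call this commit adds to main.py; the early return makes repeated runs cheap, since an already-downloaded file is skipped:

import download_files

downloader = download_files.FileDownloader()
downloader.single_file('string_edges')  # no-op if string_edges.txt.gz exists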
129 changes: 78 additions & 51 deletions main.py
@@ -6,84 +6,107 @@
import pandas as pd
import make_networks
import networkx as nx
# import multiprocessing
# from multiprocessing import Pool

import download_files

if __name__ == '__main__':

    # load in classes
    network_maker = make_networks.VirusStringNetworks()
    # Get edges from string (make initial networks)
    # Get node properties from string (add to networks)
    # Get node properties from Uniprot (add to networks)
    # Get node values during infection from ViPR (add to networks)
    # Make initial networks in Python → NetworkX objects (save networks to files)
    # Trim networks with SPRAS (trim networks, save to files)
    # Network measures → Store values on nodes, edges, whole graph (add to networks)
    # Network perturbations → Store values on nodes and edges (add to networks)

    # ----------- make directories -----------

    pickle_out = 'pickle_jar'
    if not os.path.exists(pickle_out):
        os.makedirs(pickle_out)
    data = 'data'
    if not os.path.exists(data):
        os.makedirs(data)

    data_jar = 'data_jar'
    if not os.path.exists(data_jar):
        os.makedirs(data_jar)
    measures = os.path.join('data', 'measures')
    if not os.path.exists(measures):
        os.makedirs(measures)

    networks_jar = 'networks'
    if not os.path.exists(networks_jar):
        os.makedirs(networks_jar)
    input_files = os.path.join('data', 'input_files')
    if not os.path.exists(input_files):
        os.makedirs(input_files)

    cytoscape_jar = os.path.join('networks', 'cytoscape')
    if not os.path.exists(cytoscape_jar):
        os.makedirs(cytoscape_jar)
    networks = 'networks'
    if not os.path.exists(networks):
        os.makedirs(networks)

    graphspace_jar = os.path.join('networks', 'graphspace')
    if not os.path.exists(graphspace_jar):
        os.makedirs(graphspace_jar)
    cytoscape = os.path.join('networks', 'cytoscape')
    if not os.path.exists(cytoscape):
        os.makedirs(cytoscape)

    # ----------- load in the networks -----------
    full = os.path.join('networks', 'full')
    if not os.path.exists(full):
        os.makedirs(full)

    # STRING ONLY
    data_dir = 'data_jar'
    edges_file = os.path.join(data_dir, 'protein.links.full.v10.5.txt')
    nodes_dir = os.path.join(data_dir)
    networks_file_out = os.path.join('networks', 'string_networks.p')
    trimmed = os.path.join('networks', 'trimmed')
    if not os.path.exists(trimmed):
        os.makedirs(trimmed)

    final = os.path.join('networks', 'final')
    if not os.path.exists(final):
        os.makedirs(final)

    # ----------- download input files -----------

    source = 'string_edges'
    file_downloader = download_files.FileDownloader()
    file_downloader.single_file(source)

    # ----------- make full networks -----------

    network_maker = make_networks.MakeNetworks()

    # save edge-only network to file
    edge_only_network_file = os.path.join(full, 'edges-only.p')
    # network_maker.make_edge_network(os.path.join('data', 'input_files', 'string_edges.txt'), edge_only_network_file)

    # ----------- explore these networks -----------
    # get node list for CHTC
    list_file = os.path.join(full, 'pid_input_files', 'protein_ids_')
    # network_maker.make_node_list_chtc(edge_only_network_file, list_file)
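    # A rough sketch of the chunking step, for illustration only; the real
    # implementation is make_networks.make_node_list_chtc, and the chunk
    # size below is an assumption:
    #
    #   with open(edge_only_network_file, 'rb') as f:
    #       G = pickle.load(f)
    #   pids = sorted(G.nodes())
    #   chunk_size = 1000  # hypothetical; one pid file per CHTC job
    #   for i in range(0, len(pids), chunk_size):
    #       with open(list_file + str(i // chunk_size), 'w') as f:
    #           f.write('\n'.join(pids[i:i + chunk_size]))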

    # n edges = 3,311,139
    # n_edges = sum(list(map(lambda x: len(list(networks[x].edges())), networks)))
    # collect uniprot data using CHTC
    # copy CHTC/get_uniprot_data.py and pid_input_files to HTC.learn
    # transfer results/ from HTC back to networks/full

    # n host nodes = 365,437
    # list(filter(lambda x: node_information[x]['type']=='host' and node_information[x]['uniprot_id'] is not None, node_information.keys()))
    # n host nodes with uniprot ids = 121,785
    # n hosts = 64 (11 with uniprot ids)
    # add uniprot node information to networks
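    # A minimal sketch for attaching the fetched UniProt records as node
    # attributes; assumes a network G has been loaded (e.g. from
    # edge_only_network_file), and the results path and the 'uniprot'
    # attribute name are assumptions:
    #
    #   for path in glob.glob(os.path.join('networks', 'full', 'results', '*.json')):
    #       with open(path) as f:
    #           records = json.load(f)
    #       nx.set_node_attributes(G, {pid: rec for pid, rec in records.items() if rec}, 'uniprot')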

    # n virus nodes = 4703
    # n viruses = 184
    # ----------- trim networks -----------

    # n ncbi ids = 248 = n organisms
    # use SPRAS and ViPR to trim networks

    # components = [G.subgraph(c).copy() for c in nx.connected_components(G)]
    # n components = 4639
    # ----------- measure networks -----------

    # number of inferred edges = 2,982,720
    # inferred_edges = [len(list(filter(lambda x: networks[network][x[0]][x[1]]['experiments'] == 0 and networks[network][x[0]][x[1]]['database'] == 0, list(networks[network].edges())))) for network in networks]
    # measure networks

    # number of database edges = 193,146
    # database_edges = [len(list(filter(lambda x: networks[network][x[0]][x[1]]['database'] != 0, list(networks[network].edges())))) for network in networks]
    # ----------- plot results -----------

    # number of experiment edges = 176,594
    # experiment_edges = [len(list(filter(lambda x: networks[network][x[0]][x[1]]['experiments'] != 0, list(networks[network].edges())))) for network in networks]

    # ----------- Filter the networks -----------

    edges_file = os.path.join(data_dir, 'protein.links.full.v10.5.txt')
    nodes_dir = os.path.join(data_dir)
    networks_file_out = os.path.join('OLD/networks', 'string_networks.p')

    # Evidence-based and textmined separate networks

    # 1. Filter by edge type and remove lone nodes
    # 2. Filter nodes to ones that only contain virus-host nodes
    # 3. Take subgraph of those nodes only
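    # A sketch of the three steps above for a single network G; the node
    # 'type' attribute matches the usage earlier in this file, but the exact
    # edge predicate is an assumption:
    #
    #   keep = [(u, v) for u, v in G.edges() if G[u][v]['experiments'] != 0]
    #   H = G.edge_subgraph(keep).copy()  # 1. filter by edge type; drops lone nodes
    #   vh_nodes = {n for n in H if H.nodes[n].get('type') in ('virus', 'host')}
    #   H = H.subgraph(vh_nodes).copy()   # 2 & 3. keep virus-host nodes only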

    filtered_experiments_networks_file_out = os.path.join('networks', 'filtered_experiments_string_networks.p')
    filtered_textmining_networks_file_out = os.path.join('networks', 'filtered_textmining_string_networks.p')
    measured_experiments_networks_file_out = os.path.join('networks', 'measured_experiments_string_networks.p')
    measured_textmining_networks_file_out = os.path.join('networks', 'measured_textmining_string_networks.p')
    filtered_experiments_networks_file_out = os.path.join('OLD/networks', 'filtered_experiments_string_networks.p')
    filtered_textmining_networks_file_out = os.path.join('OLD/networks', 'filtered_textmining_string_networks.p')
    measured_experiments_networks_file_out = os.path.join('OLD/networks', 'measured_experiments_string_networks.p')
    measured_textmining_networks_file_out = os.path.join('OLD/networks', 'measured_textmining_string_networks.p')

    # check to see if filtered networks are already made
    if not os.path.exists(filtered_experiments_networks_file_out) and not \
@@ -110,7 +133,11 @@
        with open(filtered_textmining_networks_file_out, 'rb') as f:
            filtered_textmining_networks = pickle.load(f)

    # ----------- make df of network measures -----------
    # ----------- trim networks with SPRAS (CHTC?) -----------

    # download and use proteomic data

    # ----------- measure networks (refactor for CHTC) -----------

    # just basic measures
    # add complexity measures
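    # A sketch of the basic per-network measures, one row per network; the
    # particular measures listed are illustrative, not from this commit:
    #
    #   rows = []
    #   for network_id, G in filtered_experiments_networks.items():
    #       rows.append({'network_id': network_id,
    #                    'n_nodes': G.number_of_nodes(),
    #                    'n_edges': G.number_of_edges(),
    #                    'density': nx.density(G)})
    #   measures_df = pd.DataFrame(rows)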
@@ -137,7 +164,7 @@

    # load in the csv of measures
    try:
        measures_data = pd.read_csv(os.path.join('data_jar', 'measures_experiments_' + network_id + '.csv'))
        measures_data = pd.read_csv(os.path.join('OLD/data_jar', 'measures_experiments_' + network_id + '.csv'))
    except:
        continue
