This repository has been archived by the owner on Oct 4, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pathway_parser.py
83 lines (66 loc) · 2.57 KB
/
pathway_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import logging
from enum import Enum

import networkx as nx
# Nodes in a pathway can be genes or clusters of genes
# Genes inside a cluster are CHILDGENEs so that they can be treated
# independently from their parent nodes
class PathwayElement(Enum):
    """Kind of a node in a pathway graph.

    The parser in this module resolves the type column of a pathway file
    by member name (``PathwayElement["GENE"]``), so the member names must
    match the type names used in those files.
    """

    # Kinds a node can be declared as in the pathway file.
    GENE = 1
    COMPLEX = 2
    FAMILY = 3
    PROCESS = 4

    # Assigned by the parser to any node that has a parent node, so that
    # nested genes can be handled independently of the node containing them.
    CHILDGENE = 5
# Transforms a pathwaymapper file into a graph
def pathway_to_graph(path: str):
    """Parse a PathwayMapper text file into an undirected graph.

    The file layout this parser expects is:

    * line 1: pathway title
    * line 2: empty
    * line 3: pathway description (ignored)
    * line 4: empty
    * line 5: header of the gene table
    * gene rows, tab-separated, terminated by a blank line; the first four
      columns are read as label, node id, type name and parent id, where a
      parent id of ``"-1"`` means "no parent"
    * header of the edge table
    * edge rows, tab-separated, until a blank line or the end of the file;
      the first three columns are read as edge id, source id and target id

    Every gene row becomes a node keyed by its node id, with attributes
    ``label`` and ``ptype``. A node with a parent is typed
    ``PathwayElement.CHILDGENE`` (its declared type is ignored) and is
    linked to its parent by an edge. Edge rows become edges carrying an
    ``eid`` attribute; rows that mention a node id not present in the graph
    are skipped.

    Args:
        path: Location of the pathway file.

    Returns:
        A ``(title, graph)`` tuple, where ``graph`` is an ``nx.Graph``.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a row has fewer columns than the parser reads.
        KeyError: If the type name of a parentless node is not the name of
            a ``PathwayElement`` member.
    """
    log = logging.getLogger(__name__)
    graph = nx.Graph()

    with open(path) as pwfile:
        title = pwfile.readline().strip()
        # Skip: empty line, description, empty line, gene-table header.
        for _ in range(4):
            pwfile.readline()

        for fields in _read_table_rows(pwfile):
            label, node_id, parent_id = fields[0], fields[1], fields[3]
            has_parent = parent_id != "-1"
            if has_parent:
                ptype = PathwayElement.CHILDGENE
            else:
                ptype = PathwayElement[fields[2]]
            graph.add_node(node_id, label=label, ptype=ptype)

            # An edge can connect to more than one vertex because of gene
            # complexes or families; this is modelled by linking each member
            # to its parent node, which acts as the intermediate node.
            # Keyed on has_parent (not on ptype) so that a parentless row
            # never gets an edge to a bogus "-1" node.
            if has_parent:
                log.debug("%s %s -> %s", label, node_id, parent_id)
                graph.add_edge(node_id, parent_id)

        # The gene table ended on a blank line; next comes the edge header.
        pwfile.readline()

        for fields in _read_table_rows(pwfile):
            edge_id, source, target = fields[0], fields[1], fields[2]
            # Only connect nodes that are already part of the graph.
            if source in graph and target in graph:
                log.debug("%s -> %s", source, target)
                graph.add_edge(source, target, eid=edge_id)

    return (title, graph)


def _read_table_rows(handle):
    """Yield the tab-separated fields of each line up to a blank line or EOF.

    The terminating blank line is consumed. The line terminator is removed
    before splitting so the last column never carries a trailing newline.
    """
    while True:
        line = handle.readline()
        # readline() returns "" at end of file, which also ends the table.
        if not line.strip():
            return
        yield line.rstrip("\r\n").split("\t")
# Counts genes, complexes and families
def total_gene_groups(pathway: nx.Graph) -> int:
    """Return how many nodes are typed GENE, FAMILY or COMPLEX.

    Only the ``ptype`` node attribute is inspected; nodes that do not carry
    that attribute are not counted.
    """
    group_kinds = (
        PathwayElement.GENE,
        PathwayElement.FAMILY,
        PathwayElement.COMPLEX,
    )
    kind_by_node = nx.get_node_attributes(pathway, "ptype")
    return sum(1 for kind in kind_by_node.values() if kind in group_kinds)
# Counts all the genes present in the pathway
def total_genes(pathway: nx.Graph) -> int:
    """Return how many nodes are typed GENE or CHILDGENE.

    Only the ``ptype`` node attribute is inspected; nodes that do not carry
    that attribute are not counted.
    """
    gene_kinds = (PathwayElement.GENE, PathwayElement.CHILDGENE)
    kind_by_node = nx.get_node_attributes(pathway, "ptype")
    return sum(1 for kind in kind_by_node.values() if kind in gene_kinds)
def get_gene_names(pathway: nx.Graph):
    """Return a view of the ``label`` attribute of every labelled node.

    NOTE(review): no filtering by ``ptype`` happens here, so labels of all
    node kinds are included, not only genes — confirm this is intended.
    """
    label_by_node = nx.get_node_attributes(pathway, "label")
    return label_by_node.values()