Skip to content
This repository was archived by the owner on Mar 11, 2019. It is now read-only.

Commit b3ad651

Browse files
Luis Francisco Hernández SánchezLuis Francisco Hernández Sánchez
Luis Francisco Hernández Sánchez
authored and
Luis Francisco Hernández Sánchez
committed
Updated mapping files to Reactome v65
1 parent b0ffad8 commit b3ad651

35 files changed

+279
-0
lines changed

docs/statistics.md

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
# Physical entities
2+
3+
4+
5+
# Proteins
6+
7+
* Count proteins:
8+
~~~~
9+
MATCH (re:ReferenceEntity{databaseName:'UniProt'})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:'Homo sapiens'})
10+
RETURN count(DISTINCT re.identifier) as protein
11+
~~~~
12+
13+
* Number of reactions per protein:
14+
~~~~
15+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
16+
WITH re, pe
17+
MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
18+
WHERE r.speciesName = 'Homo sapiens'
19+
WITH re.identifier as protein, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet
20+
RETURN DISTINCT protein, size(reactionSet) as reactionCount
21+
~~~~
22+
23+
* Number of pathways for each protein:
24+
~~~~
25+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
26+
WITH re, pe
27+
MATCH (p:Pathway{speciesName:'Homo sapiens'})-[:hasEvent*]->(r:Reaction{speciesName:'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
28+
WITH re.identifier as protein, collect(DISTINCT p.stId) as pathwaySet
29+
RETURN protein, size(pathwaySet) as pathwayCount
30+
~~~~
31+
32+
* Number of reactions and pathways for each protein:
33+
~~~~
34+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
35+
WITH re, pe
36+
OPTIONAL MATCH (p:Pathway{speciesName:'Homo sapiens'})-[:hasEvent*]->(r:Reaction{speciesName:'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
37+
WITH re.identifier as protein, collect(DISTINCT p.stId) as pathwaySet, collect(DISTINCT r.stId) as reactionSet
38+
RETURN protein, size(pathwaySet) as pathwayCount, size(reactionSet) as reactionCount
39+
~~~~
40+
41+
* Statistics of reactions for all proteins:
42+
~~~~
43+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
44+
WITH re, pe
45+
MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
46+
WHERE r.speciesName = 'Homo sapiens'
47+
WITH re.identifier as protein, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet
48+
WITH DISTINCT protein, size(reactionSet) as reactionCount
49+
RETURN min(reactionCount) as minReactionCount, avg(reactionCount) as avgReactionCount, max(reactionCount) as maxReactionCount
50+
~~~~
51+
52+
* Statistics of pathways for all proteins:
53+
~~~~
54+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
55+
WITH re, pe
56+
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
57+
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
58+
WITH re.identifier as protein, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT p.stId) as pathwaySet
59+
WITH protein, size(pathwaySet) as pathwayCount
60+
RETURN min(pathwayCount) as minPathwayCount, avg(pathwayCount) as avgPathwayCount, max(pathwayCount) as maxPathwayCount
61+
~~~~
62+
63+
### Results
64+
65+
* Reactions per protein
66+
67+
|Min|Average|Max|
68+
|---|---|---|
69+
|0|7.083|315|
70+
71+
* Reactions per protein with at least one annotated reaction
72+
73+
|Min |Average | Max|
74+
| --- | --- | --- |
75+
| 1 | 7.424 | 315 |
76+
77+
* Pathways per protein
78+
79+
|Min |Average | Max|
80+
| --- | --- | --- |
81+
|0|8.330|292|
82+
83+
* Pathways per protein with at least one pathway annotated
84+
85+
|Min |Average | Max|
86+
| --- | --- | --- |
87+
|1|8.733|292|
88+
89+
90+
---------------------------------------------------------------------------------------------------------------------------
91+
# Proteoforms
92+
93+
* PTM sets per PhysicalEntity, including proteins without modifications (include empty sets)
94+
~~~~
95+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
96+
WITH re, pe
97+
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
98+
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
99+
WITH DISTINCT re.identifier as protein, pe, collect(DISTINCT p.stId) as pathwaySet
100+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(mr)
101+
RETURN DISTINCT protein, pe.stId, pathwaySet, collect(mr.coordinate) as ptmSet
102+
ORDER BY protein
103+
~~~~
104+
105+
* Count PTMSets for each protein:
106+
~~~~
107+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
108+
WITH re, pe
109+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(mr)
110+
WITH DISTINCT re.identifier as protein, pe, collect(mr.coordinate) as ptmSet
111+
WITH DISTINCT protein, size(collect(ptmSet)) as ptmSetCount
112+
RETURN min(ptmSetCount), avg(ptmSetCount), max(ptmSetCount)
113+
~~~~
114+
115+
* Get all proteoforms: 13812
116+
~~~~
117+
MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'})
118+
WITH DISTINCT pe, re
119+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod)
120+
WITH DISTINCT pe, re, tm.coordinate as coordinate, mod.identifier as type
121+
ORDER BY type, coordinate
122+
WITH DISTINCT pe, re, COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms
123+
WITH DISTINCT pe, re, ptms
124+
RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, collect(DISTINCT pe.stId) as equivalentPe, ptms
125+
~~~~
126+
127+
* Number of reactions of each proteoform:
128+
~~~~
129+
MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'})
130+
WITH DISTINCT pe, re
131+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod)
132+
WITH DISTINCT pe,
133+
re,
134+
tm.coordinate as coordinate,
135+
mod.identifier as type
136+
ORDER BY type, coordinate
137+
WITH DISTINCT
138+
pe,
139+
re,
140+
COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms
141+
WITH DISTINCT pe, re, ptms
142+
OPTIONAL MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
143+
WHERE r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens'
144+
RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, ptms, size(collect(DISTINCT r.stId)) as reactionCount
145+
ORDER BY protein
146+
~~~~
147+
148+
* Statistics of reactions of each proteoform:
149+
~~~~
150+
MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'})
151+
WITH DISTINCT pe, re
152+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod)
153+
WITH DISTINCT pe,
154+
re,
155+
tm.coordinate as coordinate,
156+
mod.identifier as type
157+
ORDER BY type, coordinate
158+
WITH DISTINCT
159+
pe,
160+
re,
161+
COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms
162+
WITH DISTINCT pe, re, ptms
163+
OPTIONAL MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
164+
WHERE r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens'
165+
WITH DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, collect(DISTINCT pe.stId) as equivalentPe, ptms, size(collect(DISTINCT r.stId)) as reactionCount
166+
RETURN min(reactionCount) as minReactionCount, avg(reactionCount) as avgReactionCount, max(reactionCount) as maxReactionCount
167+
~~~~
168+
169+
* Number of pathways for each proteoform:
170+
~~~~
171+
MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'})
172+
WITH DISTINCT pe, re
173+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod)
174+
WITH DISTINCT pe, re, tm.coordinate as coordinate, mod.identifier as type
175+
ORDER BY type, coordinate
176+
WITH DISTINCT pe, re, COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms
177+
WITH DISTINCT pe, re, ptms
178+
MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
179+
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens'
180+
RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, ptms, size(collect(DISTINCT p.stId)) as pathwayCount
181+
~~~~
182+
183+
* Statistics of pathways for each proteoform:
184+
~~~
185+
MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'})
186+
WITH DISTINCT pe, re
187+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod)
188+
WITH DISTINCT pe,
189+
re,
190+
tm.coordinate as coordinate,
191+
mod.identifier as type
192+
ORDER BY type, coordinate
193+
WITH DISTINCT
194+
pe,
195+
re,
196+
COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms
197+
WITH DISTINCT pe, re, ptms
198+
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
199+
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens'
200+
WITH DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, collect(DISTINCT pe.stId) as equivalentPe, ptms, size(collect(DISTINCT p.stId)) as pathwayCount
201+
RETURN min(pathwayCount) as minPathwayCount, avg(pathwayCount) as avgPathwayCount, max(pathwayCount) as maxPathwayCount
202+
~~~
203+
204+
### Results
205+
206+
* PTM sets per protein
207+
208+
|min|average|max|
209+
|---|---|---|
210+
|1|2.232|316|
211+
212+
* Reactions per proteoform
213+
214+
|min|average|max|
215+
|---|---|---|
216+
|0|6.128|313|
217+
218+
* Reactions per proteoform with at least one reaction annotated
219+
220+
|min|average|max|
221+
|---|---|---|
222+
|1|6.494|313|
223+
224+
* Pathways per proteoform
225+
226+
|min|average|max|
227+
|---|---|---|
228+
|0|7.66|291|
229+
230+
* Pathways per proteoform with at least one pathway annotated
231+
232+
|min|average|max|
233+
|---|---|---|
234+
|1|8.122|291|
235+
236+
* Note: A query that uses a plain "MATCH" for the pathway and reaction connection only counts the pathways and reactions of proteins that have at least one reaction and pathway annotated. Use "OPTIONAL MATCH" instead to include all proteins.
237+
* Note: The queries find all the pathways through their reactions, no matter the level in the pathway hierarchy.
238+
239+
240+
* Top proteins participating in most reactions:
241+
~~~~
242+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
243+
WITH re, pe
244+
MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
245+
WHERE r.speciesName = 'Homo sapiens'
246+
WITH re.identifier as protein, re.displayName as name, re.description as description, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet
247+
RETURN DISTINCT protein, name, description, size(reactionSet) as reactionCount
248+
ORDER BY reactionCount DESC
249+
LIMIT 10
250+
~~~~
251+
252+
* Top proteins participating in most pathways and their containing reactions:
253+
~~~~
254+
MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"})
255+
WITH re, pe
256+
OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
257+
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens'
258+
WITH re.identifier as protein, re.displayName as name, re.description as description, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet, collect(DISTINCT p.stId) as pathwaySet
259+
RETURN DISTINCT protein, name, description, size(reactionSet) as reactionCount, size(pathwaySet) as pathwayCount
260+
ORDER BY pathwayCount DESC, reactionCount DESC, protein, name
261+
LIMIT 10
262+
~~~~
263+
264+
* Top proteoforms participating in most pathways and their containing reactions:
265+
~~~
266+
MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'})
267+
WITH DISTINCT pe, re
268+
OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod)
269+
WITH DISTINCT pe, re, tm.coordinate as coordinate, mod.identifier as type
270+
ORDER BY type, coordinate
271+
WITH DISTINCT pe, re, COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms
272+
WITH DISTINCT pe, re, ptms
273+
MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe)
274+
WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens'
275+
RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, re.displayName as name, re.description as description, ptms, collect(DISTINCT pe.stId) as peSet, size(collect(DISTINCT r.stId)) as reactionCount, size(collect(DISTINCT p.stId)) as pathwayCount
276+
ORDER BY pathwayCount DESC, reactionCount DESC, protein, name
277+
LIMIT 10
278+
~~~
279+

resources/iPathways.gz

279 Bytes
Binary file not shown.

resources/iProteins.gz

262 Bytes
Binary file not shown.

resources/imapChrBpToProteins1.gz

1.47 KB
Binary file not shown.

resources/imapChrBpToProteins16.gz

1.9 KB
Binary file not shown.

resources/imapChrBpToProteins18.gz

875 Bytes
Binary file not shown.

resources/imapChrBpToProteins19.gz

1.85 KB
Binary file not shown.

resources/imapChrBpToProteins3.gz

-1.95 KB
Binary file not shown.

resources/imapChrBpToProteins5.gz

2.68 KB
Binary file not shown.

resources/imapChrBpToProteins8.gz

6.91 KB
Binary file not shown.

resources/imapChrBpToProteins9.gz

5.01 KB
Binary file not shown.

resources/imapComplexesToProteins.gz

440 Bytes
Binary file not shown.
835 Bytes
Binary file not shown.

resources/imapEnsemblToProteins.gz

2.9 KB
Binary file not shown.

resources/imapGenesToProteins.gz

207 Bytes
Binary file not shown.
73 Bytes
Binary file not shown.

resources/imapProteinsToComplexes.gz

1.31 KB
Binary file not shown.
891 Bytes
Binary file not shown.

resources/imapProteinsToReactions.gz

1.47 KB
Binary file not shown.

resources/imapProteinsToSets.gz

223 Bytes
Binary file not shown.
1.78 KB
Binary file not shown.
3.23 KB
Binary file not shown.

resources/imapProteoformsToSets.gz

727 Bytes
Binary file not shown.

resources/imapReactions.gz

5.37 KB
Binary file not shown.

resources/imapReactionsToPathways.gz

275 Bytes
Binary file not shown.

resources/imapRsIdsToProteins1.gz

3.03 KB
Binary file not shown.

resources/imapRsIdsToProteins16.gz

4.25 KB
Binary file not shown.

resources/imapRsIdsToProteins18.gz

1.85 KB
Binary file not shown.

resources/imapRsIdsToProteins19.gz

3.64 KB
Binary file not shown.

resources/imapRsIdsToProteins3.gz

-4.06 KB
Binary file not shown.

resources/imapRsIdsToProteins5.gz

6.09 KB
Binary file not shown.

resources/imapRsIdsToProteins8.gz

10.4 KB
Binary file not shown.

resources/imapRsIdsToProteins9.gz

9.32 KB
Binary file not shown.

resources/imapSetsToProteins.gz

169 Bytes
Binary file not shown.

resources/imapSetsToProteoforms.gz

652 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)