|
| 1 | +# Physical entities |
| 2 | + |
| 3 | + |
| 4 | + |
| 5 | +# Proteins |
| 6 | + |
| 7 | +* Count proteins: |
| 8 | +~~~~ |
| 9 | +MATCH (re:ReferenceEntity{databaseName:'UniProt'})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:'Homo sapiens'}) |
| 10 | +RETURN count(DISTINCT re.identifier) as protein |
| 11 | +~~~~ |
| 12 | + |
| 13 | +* Number of reactions per protein: |
| 14 | +~~~~ |
| 15 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 16 | +WITH re, pe |
| 17 | +MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 18 | +WHERE r.speciesName = 'Homo sapiens' |
| 19 | +WITH re.identifier as protein, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet |
| 20 | +RETURN DISTINCT protein, size(reactionSet) as reactionCount |
| 21 | +~~~~ |
| 22 | + |
| 23 | +* Number of pathways for each protein: |
| 24 | +~~~~ |
| 25 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 26 | +WITH re, pe |
| 27 | +MATCH (p:Pathway{speciesName:'Homo sapiens'})-[:hasEvent*]->(r:Reaction{speciesName:'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 28 | +WITH re.identifier as protein, collect(DISTINCT p.stId) as pathwaySet |
| 29 | +RETURN protein, size(pathwaySet) as pathwayCount |
| 30 | +~~~~ |
| 31 | + |
| 32 | +* Number of reactions and pathways for each protein: |
| 33 | +~~~~ |
| 34 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 35 | +WITH re, pe |
| 36 | +OPTIONAL MATCH (p:Pathway{speciesName:'Homo sapiens'})-[:hasEvent*]->(r:Reaction{speciesName:'Homo sapiens'})-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 37 | +WITH re.identifier as protein, collect(DISTINCT p.stId) as pathwaySet, collect(DISTINCT r.stId) as reactionSet |
| 38 | +RETURN protein, size(pathwaySet) as pathwayCount, size(reactionSet) as reactionCount |
| 39 | +~~~~ |
| 40 | + |
| 41 | +* Statistics of reactions for all proteins: |
| 42 | +~~~~ |
| 43 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 44 | +WITH re, pe |
| 45 | +MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 46 | +WHERE r.speciesName = 'Homo sapiens' |
| 47 | +WITH re.identifier as protein, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet |
| 48 | +WITH DISTINCT protein, size(reactionSet) as reactionCount |
| 49 | +RETURN min(reactionCount) as minReactionCount, avg(reactionCount) as avgReactionCount, max(reactionCount) as maxReactionCount |
| 50 | +~~~~ |
| 51 | + |
| 52 | +* Statistics of pathways for all proteins: |
| 53 | +~~~~ |
| 54 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 55 | +WITH re, pe |
| 56 | +OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 57 | +WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' |
| 58 | +WITH re.identifier as protein, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT p.stId) as pathwaySet |
| 59 | +WITH protein, size(pathwaySet) as pathwayCount |
| 60 | +RETURN min(pathwayCount) as minPathwayCount, avg(pathwayCount) as avgPathwayCount, max(pathwayCount) as maxPathwayCount |
| 61 | +~~~~ |
| 62 | + |
| 63 | +### Results |
| 64 | + |
| 65 | +* Reactions per protein |
| 66 | + |
| 67 | +|Min|Average|Max| |
| 68 | +|---|---|---| |
| 69 | +|0|7.083|315| |
| 70 | + |
| 71 | +* Reactions per protein with at least one annotated reaction |
| 72 | + |
| 73 | +|Min |Average | Max| |
| 74 | +| --- | --- | --- | |
| 75 | +| 1 | 7.424 | 315 | |
| 76 | + |
| 77 | +* Pathways per protein |
| 78 | + |
| 79 | +|Min |Average | Max| |
| 80 | +| --- | --- | --- | |
| 81 | +|0|8.330|292| |
| 82 | + |
| 83 | +* Pathways per protein with at least one pathway annotated |
| 84 | + |
| 85 | +|Min |Average | Max| |
| 86 | +| --- | --- | --- | |
| 87 | +|1|8.733|292| |
| 88 | + |
| 89 | + |
| 90 | +--------------------------------------------------------------------------------------------------------------------------- |
| 91 | +# Proteoforms |
| 92 | + |
| 93 | +* PTM sets per PhysicalEntity, including proteins without modifications (include empty sets) |
| 94 | +~~~~ |
| 95 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 96 | +WITH re, pe |
| 97 | +OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 98 | +WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' |
| 99 | +WITH DISTINCT re.identifier as protein, pe, collect(DISTINCT p.stId) as pathwaySet |
| 100 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(mr) |
| 101 | +RETURN DISTINCT protein, pe.stId, pathwaySet, collect(mr.coordinate) as ptmSet |
| 102 | +ORDER BY protein |
| 103 | +~~~~ |
| 104 | + |
| 105 | +* Statistics of the number of PTM sets per protein: |
| 106 | +~~~~ |
| 107 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 108 | +WITH re, pe |
| 109 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(mr) |
| 110 | +WITH DISTINCT re.identifier as protein, pe, collect(mr.coordinate) as ptmSet |
| 111 | +WITH DISTINCT protein, size(collect(ptmSet)) as ptmSetCount |
| 112 | +RETURN min(ptmSetCount), avg(ptmSetCount), max(ptmSetCount) |
| 113 | +~~~~ |
| 114 | + |
| 115 | +* Get all proteoforms: 13812 |
| 116 | +~~~~ |
| 117 | +MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'}) |
| 118 | +WITH DISTINCT pe, re |
| 119 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod) |
| 120 | +WITH DISTINCT pe, re, tm.coordinate as coordinate, mod.identifier as type |
| 121 | +ORDER BY type, coordinate |
| 122 | +WITH DISTINCT pe, re, COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms |
| 123 | +WITH DISTINCT pe, re, ptms |
| 124 | +RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, collect(DISTINCT pe.stId) as equivalentPe, ptms |
| 125 | +~~~~ |
| 126 | + |
| 127 | +* Number of reactions of each proteoform: |
| 128 | +~~~~ |
| 129 | +MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'}) |
| 130 | +WITH DISTINCT pe, re |
| 131 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod) |
| 132 | +WITH DISTINCT pe, |
| 133 | + re, |
| 134 | + tm.coordinate as coordinate, |
| 135 | + mod.identifier as type |
| 136 | +ORDER BY type, coordinate |
| 137 | +WITH DISTINCT |
| 138 | + pe, |
| 139 | + re, |
| 140 | + COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms |
| 141 | +WITH DISTINCT pe, re, ptms |
| 142 | +OPTIONAL MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 143 | +WHERE r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens' |
| 144 | +RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, ptms, size(collect(DISTINCT r.stId)) as reactionCount |
| 145 | +ORDER BY protein |
| 146 | +~~~~ |
| 147 | + |
| 148 | +* Statistics of reactions of each proteoform: |
| 149 | +~~~~ |
| 150 | +MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'}) |
| 151 | +WITH DISTINCT pe, re |
| 152 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod) |
| 153 | +WITH DISTINCT pe, |
| 154 | + re, |
| 155 | + tm.coordinate as coordinate, |
| 156 | + mod.identifier as type |
| 157 | +ORDER BY type, coordinate |
| 158 | +WITH DISTINCT |
| 159 | + pe, |
| 160 | + re, |
| 161 | + COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms |
| 162 | +WITH DISTINCT pe, re, ptms |
| 163 | +OPTIONAL MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 164 | +WHERE r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens' |
| 165 | +WITH DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, collect(DISTINCT pe.stId) as equivalentPe, ptms, size(collect(DISTINCT r.stId)) as reactionCount |
| 166 | +RETURN min(reactionCount) as minReactionCount, avg(reactionCount) as avgReactionCount, max(reactionCount) as maxReactionCount |
| 167 | +~~~~ |
| 168 | + |
| 169 | +* Number of pathways for each proteoform: |
| 170 | +~~~~ |
| 171 | +MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'}) |
| 172 | +WITH DISTINCT pe, re |
| 173 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod) |
| 174 | +WITH DISTINCT pe, re, tm.coordinate as coordinate, mod.identifier as type |
| 175 | +ORDER BY type, coordinate |
| 176 | +WITH DISTINCT pe, re, COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms |
| 177 | +WITH DISTINCT pe, re, ptms |
| 178 | +MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 179 | +WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens' |
| 180 | +RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, ptms, size(collect(DISTINCT p.stId)) as pathwayCount |
| 181 | +~~~~ |
| 182 | + |
| 183 | +* Statistics of pathways for each proteoform: |
| 184 | +~~~ |
| 185 | +MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'}) |
| 186 | +WITH DISTINCT pe, re |
| 187 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod) |
| 188 | +WITH DISTINCT pe, |
| 189 | + re, |
| 190 | + tm.coordinate as coordinate, |
| 191 | + mod.identifier as type |
| 192 | +ORDER BY type, coordinate |
| 193 | +WITH DISTINCT |
| 194 | + pe, |
| 195 | + re, |
| 196 | + COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms |
| 197 | +WITH DISTINCT pe, re, ptms |
| 198 | +OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 199 | +WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens' |
| 200 | +WITH DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, collect(DISTINCT pe.stId) as equivalentPe, ptms, size(collect(DISTINCT p.stId)) as pathwayCount |
| 201 | +RETURN min(pathwayCount) as minPathwayCount, avg(pathwayCount) as avgPathwayCount, max(pathwayCount) as maxPathwayCount |
| 202 | +~~~ |
| 203 | + |
| 204 | +### Results |
| 205 | + |
| 206 | +* PTM sets per protein |
| 207 | + |
| 208 | +|min|average|max| |
| 209 | +|---|---|---| |
| 210 | +|1|2.232|316| |
| 211 | + |
| 212 | +* Reactions per proteoform |
| 213 | + |
| 214 | +|min|average|max| |
| 215 | +|---|---|---| |
| 216 | +|0|6.128|313| |
| 217 | + |
| 218 | +* Reactions per proteoform with at least one reaction annotated |
| 219 | + |
| 220 | +|min|average|max| |
| 221 | +|---|---|---| |
| 222 | +|1|6.494|313| |
| 223 | + |
| 224 | +* Pathways per proteoform |
| 225 | + |
| 226 | +|min|average|max| |
| 227 | +|---|---|---| |
| 228 | +|0|7.66|291| |
| 229 | + |
| 230 | +* Pathways per proteoform with at least one pathway annotated |
| 231 | + |
| 232 | +|min|average|max| |
| 233 | +|---|---|---| |
| 234 | +|1|8.122|291| |
| 235 | + |
| 236 | +* Note: Queries that use a plain "MATCH" for the pathway and reaction connection only count proteins that have at least one reaction and pathway annotated. To include all proteins (those without annotations get a count of 0), use "OPTIONAL MATCH" for that connection instead. |
| 237 | +* Note: Pathways are found through their reactions, regardless of the level in the pathway hierarchy. |
| 238 | + |
| 239 | + |
| 240 | +* Top proteins participating in most reactions: |
| 241 | +~~~~ |
| 242 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 243 | +WITH re, pe |
| 244 | +MATCH (r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 245 | +WHERE r.speciesName = 'Homo sapiens' |
| 246 | +WITH re.identifier as protein, re.displayName as name, re.description as description, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet |
| 247 | +RETURN DISTINCT protein, name, description, size(reactionSet) as reactionCount |
| 248 | +ORDER BY reactionCount DESC |
| 249 | +LIMIT 10 |
| 250 | +~~~~ |
| 251 | + |
| 252 | +* Top proteins participating in most pathways and their containing reactions: |
| 253 | +~~~~ |
| 254 | +MATCH (re:ReferenceEntity{databaseName:"UniProt"})<-[:referenceEntity]-(pe:PhysicalEntity{speciesName:"Homo sapiens"}) |
| 255 | +WITH re, pe |
| 256 | +OPTIONAL MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 257 | +WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' |
| 258 | +WITH re.identifier as protein, re.displayName as name, re.description as description, collect(DISTINCT pe.stId) as peSet, collect(DISTINCT r.stId) as reactionSet, collect(DISTINCT p.stId) as pathwaySet |
| 259 | +RETURN DISTINCT protein, name, description, size(reactionSet) as reactionCount, size(pathwaySet) as pathwayCount |
| 260 | +ORDER BY pathwayCount DESC, reactionCount DESC, protein, name |
| 261 | +LIMIT 10 |
| 262 | +~~~~ |
| 263 | + |
| 264 | +* Top proteoforms participating in most pathways and their containing reactions: |
| 265 | +~~~ |
| 266 | +MATCH (pe:PhysicalEntity{speciesName:'Homo sapiens'})-[:referenceEntity]->(re:ReferenceEntity{databaseName:'UniProt'}) |
| 267 | +WITH DISTINCT pe, re |
| 268 | +OPTIONAL MATCH (pe)-[:hasModifiedResidue]->(tm:TranslationalModification)-[:psiMod]->(mod:PsiMod) |
| 269 | +WITH DISTINCT pe, re, tm.coordinate as coordinate, mod.identifier as type |
| 270 | +ORDER BY type, coordinate |
| 271 | +WITH DISTINCT pe, re, COLLECT(CASE WHEN coordinate IS NOT NULL THEN coordinate ELSE "null" END + ":" + type) AS ptms |
| 272 | +WITH DISTINCT pe, re, ptms |
| 273 | +MATCH (p:Pathway)-[:hasEvent*]->(r:Reaction)-[:input|output|catalystActivity|physicalEntity|regulatedBy|regulator|hasComponent|hasMember|hasCandidate*]->(pe) |
| 274 | +WHERE p.speciesName = 'Homo sapiens' AND r.speciesName = 'Homo sapiens' AND pe.speciesName = 'Homo sapiens' |
| 275 | +RETURN DISTINCT CASE WHEN re.variantIdentifier IS NOT NULL THEN re.variantIdentifier ELSE re.identifier END as protein, re.displayName as name, re.description as description, ptms, collect(DISTINCT pe.stId) as peSet, size(collect(DISTINCT r.stId)) as reactionCount, size(collect(DISTINCT p.stId)) as pathwayCount |
| 276 | +ORDER BY pathwayCount DESC, reactionCount DESC, protein, name |
| 277 | +LIMIT 10 |
| 278 | +~~~ |
| 279 | + |
0 commit comments