Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First pass at semantic similarity #49

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions ontobio/ontol.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,12 +598,49 @@ def logical_definitions(self, nid):
else:
return []

def definition(self, nid):
    """
    Fetch the text definition object recorded for a node

    Delegates to ``self._get_meta_prop`` using the ``'definition'`` key.

    Arguments
    ---------
    nid : str
        Node identifier for entity to be queried

    Return
    ------
    dict
        definition object, dict(val=TEXT, xrefs=LIST)
    """
    meta_key = 'definition'
    return self._get_meta_prop(nid, meta_key)

def definition_val(self, nid):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it should just be a method on whatever is returned from self.definition. So if you want the string version, you would just call str(definition(id)) instead of providing a separate method here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The most common operation is getting the value; in any case, this will be dropped, as I accidentally reimplemented this code in a separate PR that has already been merged. Doh.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh it's merged already? Where at? I was thinking we would release after this PR to get my logging changes in and this.

Oh, I just saw that it's just a dictionary that's being returned and not a real Object. I guess my suggestion doesn't make as much sense then.

"""
Text definition string value for a node

Convenience accessor: looks up the definition object via
``self.definition(nid)`` and returns its ``'val'`` entry.

Arguments
---------
nid : str
Node identifier for entity to be queried

Return
------
str
text definition, or None when ``self.definition(nid)`` returns None
"""
# Fetch the full definition object first
defn = self.definition(nid)
if defn is None:
# No definition object for this node, so there is no text either
return None
else:
return defn['val']


def get_node_type(self, nid):
    """
    Type of a node, as stored under its ``'type'`` key

    Arguments
    ---------
    nid : str
        Node identifier for entity to be queried

    Return
    ------
    object
        value of the ``'type'`` entry of ``self.node(nid)``, or None if
        the node has no such entry
    """
    node = self.node(nid)
    return node['type'] if 'type' in node else None

def _get_meta_prop(self, nid, prop):
n = self.node(nid)
if 'meta' in n:
Expand Down Expand Up @@ -763,7 +800,7 @@ def xrefs(self, nid, bidirectional=False):
nid : str
Node identifier for entity to be queried
bidirection : bool
If True, include nodes xreffed to nid
If True, include nodes that xref nid

Return
------
Expand All @@ -773,7 +810,7 @@ def xrefs(self, nid, bidirectional=False):
xg = self.xref_graph
if nid not in xg:
return []
if bidirectional:
elif bidirectional:
return xg.neighbors(nid)
else:
return [x for x in xg.neighbors(nid) if xg[nid][x]['source'] == nid]
Expand Down
105 changes: 105 additions & 0 deletions ontobio/sim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import math
import pandas as pd

class SimEngine():
    """
    Semantic similarity calculations over an association set.

    The engine reads ``subjects``, ``ontology``, ``query`` and
    ``inferred_types`` from the association set it wraps, and lazily
    builds a map of node id to information content (IC).
    """

    def __init__(self,
                 association_set=None,
                 icmap=None):
        """
        Arguments
        ---------
        association_set : object
            Association set to compute similarity over
        icmap : dict
            Optional precomputed map of node id to information content;
            when None it is computed on first use
        """
        self.association_set = association_set
        self.icmap = icmap

    def _get_icmap(self):
        """
        Return the node-to-IC map, building and caching it on first use.

        IC of a node is ``-log2(p)``, where ``p`` is the fraction of
        subjects returned by ``association_set.query([node])``. Nodes
        with no annotations (and every node when there are no subjects)
        map to None.
        """
        if self.icmap is None:
            icmap = {}
            aset = self.association_set
            num_subjs = len(aset.subjects)
            for n in aset.ontology.nodes():
                ic = None
                # Guard against an empty association set (division by zero)
                if num_subjs > 0:
                    num_anns = len(aset.query([n]))
                    if num_anns > 0:
                        # log2(N / n) == -log2(n / N); the frequency is
                        # divided by the subject count exactly once
                        ic = math.log(num_subjs / num_anns) / math.log(2)
                icmap[n] = ic
            self.icmap = icmap
        return self.icmap

    def information_content(self, nid):
        """
        Returns information content for a node

        Arguments
        ---------
        nid : str
            Node identifier

        Return
        ------
        float
            information content, or None if the map holds None for the node

        Raises
        ------
        KeyError
            if the node is absent from the IC map
        """
        icmap = self._get_icmap()
        return icmap[nid]

    def entity_jaccard_similarity(self, s1, s2):
        """
        Calculate jaccard index of inferred associations of two subjects

        |ancs(s1) /\\ ancs(s2)|
        ---
        |ancs(s1) \\/ ancs(s2)|

        Returns 0.0 when neither subject has any inferred types.
        """
        aset = self.association_set
        # set() makes this work whether inferred_types yields a set or a list
        a1 = set(aset.inferred_types(s1))
        a2 = set(aset.inferred_types(s2))
        num_union = len(a1 | a2)
        if num_union == 0:
            return 0.0
        return len(a1 & a2) / num_union

    def common_ancestors(self, c1, c2):
        """
        Reflexive common ancestors of two classes

        Return
        ------
        set
            nodes that are ancestors of (or equal to) both c1 and c2
        """
        ont = self.association_set.ontology
        a1 = set(ont.ancestors(c1, reflexive=True))
        a2 = set(ont.ancestors(c2, reflexive=True))
        return a1 & a2

    def class_jaccard_similarity(self, c1, c2):
        """
        Calculate jaccard index of two classes

        |ancs(c1) /\\ ancs(c2)|
        ---
        |ancs(c1) \\/ ancs(c2)|

        Ancestors are reflexive. Returns 0.0 when the union is empty.
        """
        ont = self.association_set.ontology
        # set() makes this work whether ancestors yields a set or a list
        a1 = set(ont.ancestors(c1, reflexive=True))
        a2 = set(ont.ancestors(c2, reflexive=True))
        num_union = len(a1 | a2)
        if num_union == 0:
            return 0.0
        return len(a1 & a2) / num_union

    def class_resnik_similarity(self, c1, c2):
        """
        Calculate resnik similarity of two classes

        Return
        ------
        (number,list)
            tuple of max_ic and list of MRCAs
        """
        max_ic = 0
        mrcas = []
        for a in self.common_ancestors(c1, c2):
            ic = self.information_content(a)
            if ic is None:
                # IC is undefined for unannotated nodes; comparing None
                # with a number would raise TypeError
                continue
            if ic > max_ic:
                max_ic = ic
                mrcas = [a]
            elif ic == max_ic:
                mrcas.append(a)
        return max_ic, mrcas

    def used_classes(self):
        """
        Set of all classes inferred for at least one subject
        """
        aset = self.association_set
        cset = set()
        for s in aset.subjects:
            cset.update(aset.inferred_types(s))
        return cset

    def dataframe(self):
        """
        Subject-by-class indicator matrix

        Return
        ------
        pandas.DataFrame
            one row per subject, one column per inferred class; a cell is
            1 where the class is inferred for the subject and missing (NaN)
            otherwise
        """
        aset = self.association_set
        # Materialise once so row order and index labels are guaranteed
        # to agree, whatever container type `subjects` is
        subjs = list(aset.subjects)
        entries = [{c: 1 for c in aset.inferred_types(s)} for s in subjs]
        return pd.DataFrame(entries, index=subjs)
5 changes: 5 additions & 0 deletions tests/test_local_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,11 @@ def test_graph():
assert NIF_CELL in xrefs
assert len(xrefs) == 2

def_val = ont.definition_val(CELL)
assert def_val.startswith("The basic structural and functional unit of all organisms")

defn = ont.definition(CELL)
assert defn['xrefs'] == [ "GOC:go_curators" ]

# xrefs are bidirectional
xrefs = ont.xrefs(WIKIPEDIA_CELL, bidirectional=True)
Expand Down
34 changes: 34 additions & 0 deletions tests/test_sim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory
from ontobio.assocmodel import AssociationSet
from ontobio.io.gafparser import GafParser
from ontobio.sim import SimEngine
import logging
import random



# Path to the GAF fixture that test_sim loads via create_from_gaf
POMBASE = "tests/resources/truncated-pombase.gaf"
# GO-prefixed identifier; not referenced in the visible part of this module
INTRACELLULAR='GO:0005622'
# PomBase-prefixed identifier; not referenced in the visible part of this module
G1 = 'PomBase:SPBC902.04'
def test_sim():
    """
    Smoke-test pairwise entity Jaccard similarity on a GAF-derived association set

    Loads the truncated ontology and GAF fixtures, then checks that the
    similarity of every ordered pair of subjects is a valid Jaccard index.
    """
    ofactory = OntologyFactory()
    afactory = AssociationSetFactory()
    ont = ofactory.create('tests/resources/go-truncated-pombase.json')
    # Close the GAF handle deterministically instead of leaking it.
    # NOTE(review): assumes create_from_gaf consumes the file before
    # returning - confirm against AssociationSetFactory.
    with open(POMBASE, "r") as gaf_file:
        aset = afactory.create_from_gaf(gaf_file,
                                        ontology=ont)

    sim = SimEngine(aset)
    for g1 in aset.subjects:
        print("G1={} '{}'".format(g1, aset.label(g1)))
        for g2 in aset.subjects:
            print(" G2={} '{}'".format(g2, aset.label(g2)))
            jsim = sim.entity_jaccard_similarity(g1, g2)
            print(" SIM={}".format(jsim))
            # A Jaccard index is a ratio |A & B| / |A | B|, so it is in [0, 1]
            assert 0.0 <= jsim <= 1.0