-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit cbf7499
Showing
39 changed files
with
4,352 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
*.py[cod] | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Packages | ||
*.egg | ||
*.egg-info | ||
dist | ||
build | ||
eggs | ||
parts | ||
bin | ||
var | ||
sdist | ||
develop-eggs | ||
.installed.cfg | ||
lib | ||
lib64 | ||
|
||
# Installer logs | ||
pip-log.txt | ||
|
||
# Unit test / coverage reports | ||
.coverage | ||
.tox | ||
nosetests.xml | ||
|
||
# Translations | ||
*.mo | ||
|
||
# Mr Developer | ||
.mr.developer.cfg | ||
.project | ||
.pydevproject |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""linkpred, a Python module for link prediction""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
import json | ||
from optparse import OptionParser | ||
|
||
from .predictors import all_predictors | ||
from .util import log | ||
|
||
__all__ = ["load_profile", "get_profile", "get_profile_by_options", | ||
"options_n_args"] | ||
|
||
|
||
def data_from_profile(fname):
    """Read a single profile file and return the parsed data.

    Files whose name ends in ".yaml" are parsed with ``yaml.safe_load``;
    every other file is parsed as JSON.

    Loading is best-effort: if the file cannot be opened or parsed, a
    warning is logged and an empty dict is returned.

    Arguments
    ---------
    fname : a string
        Name of the profile file

    Returns
    -------
    data : the parsed file contents, or {} if loading failed
    """
    data = {}
    try:
        with open(fname) as f:
            if fname.endswith(".yaml"):
                # Imported lazily so the yaml module is only required when
                # a YAML profile is actually used.
                import yaml
                data = yaml.safe_load(f)
            else:
                data = json.load(f)
    except Exception as e:
        # Deliberately broad: a broken profile must not abort the run.
        # Unlike a `return` inside `finally`, this still lets
        # KeyboardInterrupt and SystemExit propagate, and every failure
        # (I/O, parse, import) is reported instead of silently dropped.
        log.logger.warning("Encountered error '%s'" % e)
    return data
|
||
|
||
def fancy_update(base, new):
    """Return a copy of `base` updated with the items of `new`.

    Neither argument is modified. For a key present in both mappings:

    - two dicts are merged recursively,
    - two lists are concatenated (items of `base` first),
    - anything else is replaced by the value from `new`.

    Arguments
    ---------
    base : a dict
        The mapping to start from
    new : a dict
        The mapping whose items take precedence

    Returns
    -------
    updated : a dict
    """
    updated = dict(base)
    for k, v in new.items():
        if k not in base:
            updated[k] = v
        elif isinstance(base[k], dict) and isinstance(v, dict):
            updated[k] = fancy_update(base[k], v)
        elif isinstance(base[k], list) and isinstance(v, list):
            # Build a new list: `updated` is only a shallow copy, so
            # extending in place would also change the caller's `base`.
            updated[k] = base[k] + v
        else:
            updated[k] = v
    return updated
|
||
|
||
def load_profile(*fnames):
    """
    Load and merge the profiles stored in one or more files

    Files are processed in the order given; each file's data is merged
    into the result so far with `fancy_update`.

    Arguments
    ---------
    fnames : one or more strings (file names)

    Returns
    -------
    profile : a dict (empty if no file names are given)
    """
    merged = {}
    for path in fnames:
        merged = fancy_update(merged, data_from_profile(path))
    return merged
|
||
|
||
def get_profile(**kwargs):
    """Parse the command line and return the resulting profile

    All keyword arguments are passed on to `options_n_args`. Positional
    command-line arguments are not used; if any are present, a warning is
    logged.

    Returns
    -------
    profile : a dict
    """
    options, leftover = options_n_args(**kwargs)
    if leftover:
        log.logger.warning("Ignoring arguments: %s" % str(leftover))
    return get_profile_by_options(options)
|
||
|
||
def get_profile_by_options(options):
    """Determine a profile based on available options

    If multiple profiles are passed through the CLI interface,
    they are merged into one. In case of conflicts, the last profile
    supersedes the previous ones.

    Other CLI options supersede the profiles.

    Arguments
    ---------
    options : an optparse.Values object
        Parsed options, e.g. as returned by `options_n_args`. Options that
        were not defined on the parser (including `profile`) are skipped.

    Returns
    -------
    profile : a dict
    """
    # `profile` only exists if the parser was built with
    # choose_profile=True; treat a missing or empty value as "no files".
    profile_files = getattr(options, "profile", None) or []
    profile = load_profile(*profile_files)

    option_names = ("charts", "filetype", "interpolation", "steps", "only_new")
    for option_name in option_names:
        try:
            option = getattr(options, option_name)
        except AttributeError:
            # This option was not offered on the command line.
            continue
        profile[option_name] = option

    # Predictors named on the command line replace those of the profile.
    predictor_names = getattr(options, "predictors", None)
    if predictor_names:
        profile["predictors"] = [{"name": p} for p in predictor_names]

    return profile
|
||
|
||
def options_n_args(choose_chart=True, choose_profile=True,
                   choose_predictor=True, choose_filetype=False,
                   choose_weight=False, choose_interpolation=False):
    """Get nice CLI interface and return options 'n arguments.

    The `choose_*` flags determine which options the parser offers.
    `--debug` is always available and sets the log level (DEBUG if given,
    INFO otherwise).

    Arguments
    ---------
    choose_chart : True|False
        Offer -c/--chart
    choose_profile : True|False
        Offer -P/--profile
    choose_predictor : True|False
        Offer -p/--predictors and -n/--only-new
    choose_filetype : True|False
        Offer -f/--filetype
    choose_weight : True|False
        Currently unused
    choose_interpolation : True|False
        Offer -i/--no-interpolation

    Returns
    -------
    (options, args) : a 2-tuple
        The parsed options and the remaining positional arguments
    """
    chart_types = ["recall-precision", "F-score", "ROC"]

    parser = OptionParser()
    parser.add_option("--debug", action="store_true", dest="debug",
                      default=False, help="Log debug messages")
    if choose_chart:
        chart_help = "Type of chart(s) to produce (default: all available)."
        # The default must be None rather than the full list: optparse's
        # "append" action appends to the default value, so "-c ROC" would
        # otherwise yield all chart types plus "ROC" instead of just "ROC".
        parser.add_option("-c", "--chart", help=chart_help, action="append",
                          choices=chart_types, dest="charts", default=None)
    if choose_filetype:
        parser.add_option("-f", "--filetype",
                          help="Output file type (default: %default)",
                          default="pdf")
    if choose_interpolation:
        parser.add_option("-i", "--no-interpolation",
                          help="Do not interpolate precision",
                          action="store_false", dest="interpolation",
                          default=True)
    if choose_predictor:
        predictors = [p.__name__ for p in all_predictors()]
        parser.add_option(
            "-p", "--predictors", action="append", dest="predictors",
            help="Predicting methods to use (default: all available)",
            choices=predictors, default=[])
        parser.add_option(
            "-n", "--only-new", action="store_true", dest="only_new",
            default=False,
            help="Only consider new (unattested) predictions")
    if choose_profile:
        parser.add_option("-P", "--profile", action="append",
                          help="JSON profile file", default=[])

    options, args = parser.parse_args()

    # No chart selected on the command line: fall back to all chart types.
    if choose_chart and not options.charts:
        options.charts = list(chart_types)

    if options.debug:
        log.logger.setLevel(log.logging.DEBUG)
    else:
        log.logger.setLevel(log.logging.INFO)

    return options, args
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
from . import predictors | ||
from .evaluation import Comparison | ||
from .result import ResultDict # XXX | ||
from .util import log | ||
|
||
|
||
def training_test_data(profile, minimum_degree=1, **kwargs):
    """Collect the training and test data described by `profile`

    NOTE: data collection itself is not implemented yet; every result is
    currently the placeholder string "TODO".

    Arguments
    ---------
    profile : a dict
        Must contain the keys "sparql_endpoint", "training" and "test";
        "query" is optional. The "training" and "test" entries must each
        contain "name" and "parameters", plus "query" if no query has
        been determined yet.
    minimum_degree : an int
        If truthy, passed to `filter_all_low_degree_nodes` of the results
    kwargs :
        Accepted but unused

    Returns
    -------
    (training, test) : a 2-tuple
        The results stored under the names of the two data profiles
    """
    # These lookups are unused so far, but they make sure that a profile
    # lacking a required key is rejected right away.
    endpoint = profile["sparql_endpoint"]
    query = profile.get("query")
    training_profile = profile["training"]
    test_profile = profile["test"]

    collected = ResultDict()
    for dataprofile in (training_profile, test_profile):
        label = dataprofile["name"]
        parameters = dataprofile["parameters"]
        # A query found here is kept for the following data profile too.
        if query is None:
            query = dataprofile["query"]

        log.logger.info("Collecting data (%s)..." % label)
        # XXX TODO XXX
        collected[label] = "TODO"
        log.logger.info("Finished collecting data.")

    if minimum_degree:
        collected.filter_all_low_degree_nodes(minimum_degree)
    return collected[training_profile['name']], collected[test_profile['name']]
|
||
|
||
def pretty_print(name, bipartite=False, tfidf=False, params=None):
    """Pretty print a predictor name

    Arguments
    ---------
    name : a string
        Name of the predictor
    bipartite : True|False
        If True, " bipartite" is appended to the name
    tfidf : True|False
        If True, " TF-IDF" is appended to the name
    params : a dict or None
        Predictor parameters. If non-empty, they are listed between
        brackets as "key = value" pairs.

    Returns
    -------
    label : a string, e.g. "Katz bipartite (beta = 0.01)"
    """
    retval = name
    if bipartite:
        retval += " bipartite"
    if tfidf:
        retval += " TF-IDF"
    if not params:
        return retval

    pretty_params = ", ".join("%s = %s" % (k, str(v))
                              for k, v in params.items())
    return "%s (%s)" % (retval, pretty_params)
|
||
|
||
def to_tfidf(G):
    """Return a TF-IDF transformed version of the bipartite network G

    The biadjacency matrix of G is built with the 'eligible' nodes as rows
    and all other nodes as columns, transformed with `tfidf_matrix`, and
    converted back into a network. The new network's node data is G's own
    node data (the same object, not a copy).

    Every node of G must have an 'eligible' attribute.
    """
    import networkx as nx
    from linkpred.matrix import tfidf_matrix
    from linkpred.network import from_biadjacency_matrix

    assert nx.is_bipartite(G)
    eligible_nodes = [n for n in G.nodes_iter() if G.node[n]['eligible']]
    other_nodes = [n for n in G.nodes_iter() if not G.node[n]['eligible']]
    weighted = tfidf_matrix(
        nx.bipartite.biadjacency_matrix(G, eligible_nodes, other_nodes))
    transformed = from_biadjacency_matrix(weighted, eligible_nodes,
                                          other_nodes)
    transformed.node = G.node
    return transformed
|
||
|
||
def do_predict(G, predictortype, label, eligible=None, only_new=False, **kwargs):
    """Run one predictor on G and return its scoresheet

    Arguments
    ---------
    G : a network
        Passed to `predictortype` as first argument
    predictortype : a callable
        Called as predictortype(G, eligible=..., only_new=...); the
        resulting predictor is called with `kwargs`
    label : a string
        Only used in the log messages
    eligible, only_new :
        Passed on to `predictortype`
    kwargs :
        Parameters for the prediction itself

    Returns
    -------
    scoresheet : whatever the predictor returns
    """
    log.logger.info("Executing %s..." % label)
    run_predictor = predictortype(G, eligible=eligible, only_new=only_new)
    result = run_predictor(**kwargs)
    log.logger.info("Finished executing %s." % label)
    return result
|
||
|
||
def predict(training, profile, only_new=False, eligible=None):
    """Generator that yields predictions on the basis of training

    Arguments
    ---------
    training : a Result
        Training data
    profile : a dict
        Profile detailing which predictors should be used
    only_new : True|False
        Whether or not we should restrict ourselves to predicting only new links
    eligible : a string or None
        If a string, the attribute according to which 'eligible' nodes are found.
        If None, this is ignored.

    Returns
    -------
    (label, scoresheet) : a 2-tuple
        2-tuple consisting of a string (label of the prediction) and
        a Scoresheet (actual predictions)
    """
    # TF-IDF network shared by all predictors of this run, so the
    # transformation is done at most once. It is kept in a local variable
    # rather than on the function object: a cache that outlives the call
    # would be reused for later calls with different training data.
    tfidf_network = None

    for predictor_profile in profile['predictors']:
        bipartite = predictor_profile.get('bipartite', False)
        tfidf = predictor_profile.get('tfidf', False)
        parameters = predictor_profile.get('parameters', {})
        name = predictor_profile['name']
        predictortype = getattr(predictors, name)
        label = predictor_profile.get(
            'displayname', pretty_print(name, bipartite, tfidf, parameters))

        if bipartite and tfidf:
            if tfidf_network is None:
                tfidf_network = to_tfidf(training.pathspec)
            G = tfidf_network
        elif bipartite:
            G = training.pathspec
        else:
            G = training.network

        scoresheet = do_predict(
            G, predictortype, label, eligible, only_new, **parameters)

        yield label, scoresheet
|
||
|
||
def connect_signals(listeners):
    """Connect each listener to the four evaluation signals

    Every listener must provide the handlers `on_new_evaluation`,
    `on_datagroup_finished`, `on_dataset_finished` and `on_run_finished`.
    """
    from linkpred.evaluation import signals
    for subscriber in listeners:
        signals.new_evaluation.connect(subscriber.on_new_evaluation)
        signals.datagroup_finished.connect(subscriber.on_datagroup_finished)
        signals.dataset_finished.connect(subscriber.on_dataset_finished)
        signals.run_finished.connect(subscriber.on_run_finished)
|
||
|
||
def evaluate(datasets, name, filetype="pdf", interpolate=True, steps=1):
    """Evaluate one or more datasets with the standard set of listeners

    Arguments
    ---------
    datasets : a DataSet or an iterable of DataSets
    name : a string
        Passed to the plotters and to the FMaxListener
    filetype : a string
        Passed to the plotters
    interpolate : True|False
        Passed to the RecallPrecisionPlotter
    steps : an int
        Passed to the FScorePlotter
    """
    import linkpred.evaluation.listeners as l
    # TODO figure out easy way to specify which listeners we want
    listeners = (
        l.CachingListener(),
        l.RecallPrecisionPlotter(name, filetype=filetype,
                                 interpolate=interpolate),
        l.FScorePlotter(name, filetype=filetype, xlabel="# predictions",
                        steps=steps),
        l.ROCPlotter(name, filetype=filetype),
        l.FMaxListener(name),
    )
    connect_signals(listeners)

    comparison = Comparison()
    try:
        comparison.register_datasets(datasets)
    except TypeError:
        # Not iterable, so this must be a single dataset.
        comparison.register_dataset(datasets)
    comparison.run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
"""Module for evaluating link prediction results""" | ||
from .comparison import Comparison, DataSet | ||
from .listeners import * | ||
from .scoresheet import * | ||
from .signals import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from .ranked import ranked_evaluation | ||
from .signals import new_evaluation, datagroup_finished,\ | ||
dataset_finished, run_finished | ||
from ..util import log | ||
|
||
__all__ = ["DataSet", "Comparison"] | ||
|
||
|
||
class DataSet(object):
    """Predictions and test data that are to be compared with each other

    Arguments
    ---------
    name : a string
        Name of the dataset
    predictions :
        Stored as is; `Comparison.run` iterates over it expecting
        (predictor name, scoresheet) pairs
    test :
        Test data; must support len() and `for_comparison(exclude=...)`
    exclude : a set or None
        Items that should no longer be considered (because they're already
        in the training network). None means: exclude nothing.
    steps : an int
        Stored as is; `Comparison.run` passes it to `ranked_evaluation`
    """

    def __init__(self, name, predictions, test, exclude=None, steps=1):
        # A fresh set per instance instead of one shared mutable default.
        if exclude is None:
            exclude = set()
        self.name = name
        self.predictions = predictions
        self.test = test.for_comparison(exclude=exclude)
        self.steps = steps
        nnodes = len(test)
        # Universe = all possible edges, except for the ones that we no longer
        # consider (because they're already in the training network).
        # n * (n - 1) is always even, so floor division is exact and keeps
        # the result an integer regardless of the division semantics in use.
        self.num_universe = nnodes * (nnodes - 1) // 2 - len(exclude)
        log.logger.debug("Constructed dataset '%s': "
                         "num_universe = %d" % (self.name, self.num_universe))
|
||
|
||
class Comparison(object):
    """Collection of datasets whose predictions are evaluated in one run

    Iterating over a Comparison yields its registered datasets.
    """

    def __init__(self):
        self.datasets = []

    def __iter__(self):
        return iter(self.datasets)

    def register_dataset(self, dataset):
        """Add a single dataset"""
        self.datasets.append(dataset)

    def register_datasets(self, datasets):
        """Add every dataset of the iterable `datasets`"""
        for dataset in datasets:
            self.register_dataset(dataset)

    def run(self):
        """Evaluate all predictions of all datasets

        The results are not returned but sent out as signals:
        `new_evaluation` for each evaluation, `datagroup_finished` after
        each predictor, `dataset_finished` after each dataset and
        `run_finished` at the very end.
        """
        for dataset in self.datasets:
            for predictorname, scoresheet in dataset.predictions:
                evaluations = ranked_evaluation(
                    scoresheet, dataset.test, n=dataset.steps,
                    universe=dataset.num_universe)
                for evaluation in evaluations:
                    new_evaluation.send(
                        sender=self, evaluation=evaluation,
                        dataset=dataset.name, predictor=predictorname)
                datagroup_finished.send(
                    sender=self, dataset=dataset.name,
                    predictor=predictorname)
            dataset_finished.send(sender=self, dataset=dataset.name)
        run_finished.send(sender=self)
Oops, something went wrong.