Initial commit
rafguns committed Oct 9, 2013
0 parents commit cbf7499
Showing 39 changed files with 4,352 additions and 0 deletions.
35 changes: 35 additions & 0 deletions .gitignore
@@ -0,0 +1,35 @@
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
1 change: 1 addition & 0 deletions linkpred/__init__.py
@@ -0,0 +1 @@
"""linkpred, a Python module for link prediction"""
140 changes: 140 additions & 0 deletions linkpred/cli.py
@@ -0,0 +1,140 @@
import json
from optparse import OptionParser

from .predictors import all_predictors
from .util import log

__all__ = ["load_profile", "get_profile", "get_profile_by_options",
           "options_n_args"]


def data_from_profile(fname):
    data = {}
    try:
        with open(fname) as f:
            if fname.endswith(".yaml"):
                import yaml
                data = yaml.safe_load(f)
            else:
                data = json.load(f)
    except (AttributeError, TypeError) as e:
        log.logger.warning("Encountered error '%s'" % e)
    finally:
        return data


def fancy_update(base, new):
    updated = dict(base.iteritems())
    for k, v in new.iteritems():
        if k not in base:
            updated[k] = v
        elif type(base[k]) == type(v) == dict:
            updated[k] = fancy_update(base[k], v)
        elif type(base[k]) == type(v) == list:
            updated[k].extend(v)
        else:
            updated[k] = v
    return updated


def load_profile(*fnames):
    """
    Load profile from one or more files

    Arguments
    ---------
    fnames : one or more strings (file names)

    """
    profile = {}
    for fname in fnames:
        data = data_from_profile(fname)
        profile = fancy_update(profile, data)
    return profile


def get_profile(**kwargs):
    options, args = options_n_args(**kwargs)
    if args:
        log.logger.warning("Ignoring arguments: %s" % str(args))
    return get_profile_by_options(options)


def get_profile_by_options(options):
    """Determine a profile based on available options

    If multiple profiles are passed through the CLI interface,
    they are merged into one. In case of conflicts, the last profile
    supersedes the previous ones.
    Other CLI options supersede the profiles.

    Arguments
    ---------
    options : an optparse.Options object

    Returns
    -------
    profile : a dict

    """
    profile = load_profile(*options.profile)

    option_names = ["charts", "filetype", "interpolation", "steps", "only_new"]
    for option_name in option_names:
        try:
            option = getattr(options, option_name)
        except AttributeError:
            continue
        profile[option_name] = option

    if hasattr(options, 'predictors') and options.predictors:
        profile['predictors'] = []
        for p in options.predictors:
            profile['predictors'].append({'name': p})

    return profile


def options_n_args(choose_chart=True, choose_profile=True,
                   choose_predictor=True, choose_filetype=False,
                   choose_weight=False, choose_interpolation=False):
    """Get nice CLI interface and return options 'n arguments."""

    parser = OptionParser()
    parser.add_option("--debug", action="store_true", dest="debug",
                      default=False, help="Log debug messages")
    if choose_chart:
        chart_help = "Type of chart(s) to produce (default: all available)."
        chart_types = ["recall-precision", "F-score", "ROC"]
        parser.add_option("-c", "--chart", help=chart_help, action="append",
                          choices=chart_types, dest="charts", default=chart_types)
    if choose_filetype:
        parser.add_option("-f", "--filetype",
                          help="Output file type (default: %default)", default="pdf")
    if choose_interpolation:
        parser.add_option("-i", "--no-interpolation",
                          help="Do not interpolate precision", action="store_false",
                          dest="interpolation", default=True)
    if choose_predictor:
        predictors = [p.__name__ for p in all_predictors()]
        parser.add_option(
            "-p", "--predictors", action="append", dest="predictors",
            help="Predicting methods to use (default: all available)",
            choices=predictors, default=[])
        parser.add_option(
            "-n", "--only-new", action="store_true", dest="only_new",
            default=False,
            help="Only consider new (unattested) predictions")
    if choose_profile:
        parser.add_option("-P", "--profile", action="append",
                          help="JSON profile file", default=[])

    options, args = parser.parse_args()
    if options.debug:
        log.logger.setLevel(log.logging.DEBUG)
    else:
        log.logger.setLevel(log.logging.INFO)

    return options, args
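
As an aside, the merge semantics of fancy_update() and load_profile() above can be illustrated with a short sketch. It is not part of the commit and assumes Python 2 (the code relies on iteritems()) with linkpred importable; the predictor names below are made-up examples standing in for parsed JSON or YAML profiles.

from linkpred.cli import fancy_update

base = {"predictors": [{"name": "CommonNeighbours"}], "charts": ["ROC"]}
new = {"predictors": [{"name": "Katz"}], "steps": 5}
merged = fancy_update(base, new)
# Lists are extended, nested dicts are merged recursively, and other values
# are overwritten, so `merged` becomes:
# {"predictors": [{"name": "CommonNeighbours"}, {"name": "Katz"}],
#  "charts": ["ROC"], "steps": 5}

load_profile() applies the same merge file by file, which is why a later profile on the command line supersedes earlier ones.
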
150 changes: 150 additions & 0 deletions linkpred/core.py
@@ -0,0 +1,150 @@
from . import predictors
from .evaluation import Comparison
from .result import ResultDict # XXX
from .util import log


def training_test_data(profile, minimum_degree=1, **kwargs):
    endpoint = profile["sparql_endpoint"]
    query = profile.get("query")
    training = profile["training"]
    test = profile["test"]

    results = ResultDict()
    for dataprofile in (training, test):
        name = dataprofile["name"]
        parameters = dataprofile["parameters"]
        if query is None:
            query = dataprofile["query"]

        log.logger.info("Collecting data (%s)..." % name)
        # XXX TODO XXX
        results[name] = "TODO"
        log.logger.info("Finished collecting data.")

    if minimum_degree:
        results.filter_all_low_degree_nodes(minimum_degree)
    return results[training['name']], results[test['name']]


def pretty_print(name, bipartite=False, tfidf=False, params={}):
    """Pretty print a predictor name"""
    retval = name
    if bipartite:
        retval += " bipartite"
    if tfidf:
        retval += " TF-IDF"
    if not params:
        return retval

    pretty_params = ", ".join("%s = %s" % (k, str(v))
                              for k, v in params.iteritems())
    return "%s (%s)" % (retval, pretty_params)


def to_tfidf(G):
    """TF-IDF transform the edges of G

    This is done by transforming its adjacency matrix and then converting back
    to a network.

    """
    import networkx as nx
    from linkpred.matrix import tfidf_matrix
    from linkpred.network import from_biadjacency_matrix

    assert nx.is_bipartite(G)
    row_items = [n for n in G.nodes_iter() if G.node[n]['eligible']]
    col_items = [n for n in G.nodes_iter() if not G.node[n]['eligible']]
    matrix = nx.bipartite.biadjacency_matrix(G, row_items, col_items)
    matrix = tfidf_matrix(matrix)
    G2 = from_biadjacency_matrix(matrix, row_items, col_items)
    G2.node = G.node
    return G2


def do_predict(G, predictortype, label, eligible=None, only_new=False, **kwargs):
    log.logger.info("Executing %s..." % label)
    predictor = predictortype(G, eligible=eligible, only_new=only_new)
    scoresheet = predictor(**kwargs)
    log.logger.info("Finished executing %s." % label)
    return scoresheet


def predict(training, profile, only_new=False, eligible=None):
    """Generator that yields predictions on the basis of training

    Arguments
    ---------
    training : a Result
        Training data
    profile : a dict
        Profile detailing which predictors should be used
    only_new : True|False
        Whether or not we should restrict ourselves to predicting only new links
    eligible : a string or None
        If a string, the attribute according to which 'eligible' nodes are found.
        If None, this is ignored.

    Returns
    -------
    (label, scoresheet) : a 2-tuple
        2-tuple consisting of a string (label of the prediction) and
        a Scoresheet (actual predictions)

    """
    for predictor_profile in profile['predictors']:
        bipartite = predictor_profile.get('bipartite', False)
        tfidf = predictor_profile.get('tfidf', False)
        parameters = predictor_profile.get('parameters', {})
        name = predictor_profile['name']
        predictortype = getattr(predictors, name)
        label = predictor_profile.get('displayname',
                                      pretty_print(name, bipartite, tfidf, parameters))

        if bipartite and tfidf:
            # Create a reusable TF-IDF network, so we don't have to do this
            # transformation for each predictor.
            if not hasattr(predict, 'tfidf_network'):
                predict.tfidf_network = to_tfidf(training.pathspec)
            G = predict.tfidf_network
        elif bipartite:
            G = training.pathspec
        else:
            G = training.network

        scoresheet = do_predict(
            G, predictortype, label, eligible, only_new, **parameters)

        yield label, scoresheet


def connect_signals(listeners):
    from linkpred.evaluation import signals
    for listener in listeners:
        signals.new_evaluation.connect(listener.on_new_evaluation)
        signals.datagroup_finished.connect(listener.on_datagroup_finished)
        signals.dataset_finished.connect(listener.on_dataset_finished)
        signals.run_finished.connect(listener.on_run_finished)


def evaluate(datasets, name, filetype="pdf", interpolate=True, steps=1):
    import linkpred.evaluation.listeners as l
    # TODO figure out easy way to specify which listeners we want
    cache = l.CachingListener()
    rp = l.RecallPrecisionPlotter(name, filetype=filetype,
                                  interpolate=interpolate)
    f = l.FScorePlotter(name, filetype=filetype, xlabel="# predictions",
                        steps=steps)
    roc = l.ROCPlotter(name, filetype=filetype)
    fmax = l.FMaxListener(name)
    connect_signals((cache, rp, f, roc, fmax))

    comp = Comparison()
    try:
        comp.register_datasets(datasets)
    except TypeError:  # Oops, not iterable!
        comp.register_dataset(datasets)
    comp.run()
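
For orientation, the following sketch (not part of the commit) shows the kind of profile dict that predict() iterates over: a 'predictors' list whose entries name classes in linkpred.predictors and may carry optional 'bipartite', 'tfidf', 'displayname' and 'parameters' keys. The predictor names and the beta value are assumed examples, not prescribed by this code.

profile = {
    "predictors": [
        {"name": "CommonNeighbours"},          # assumed predictor name
        {"name": "Katz",                       # assumed predictor name
         "parameters": {"beta": 0.001},        # assumed parameter
         "displayname": "Katz (beta = 0.001)"},
    ]
}

# Each entry then yields one (label, Scoresheet) pair:
# for label, scoresheet in predict(training, profile, only_new=True):
#     ...
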
5 changes: 5 additions & 0 deletions linkpred/evaluation/__init__.py
@@ -0,0 +1,5 @@
"""Module for evaluating link prediction results"""
from .comparison import Comparison, DataSet
from .listeners import *
from .scoresheet import *
from .signals import *
49 changes: 49 additions & 0 deletions linkpred/evaluation/comparison.py
@@ -0,0 +1,49 @@
from .ranked import ranked_evaluation
from .signals import new_evaluation, datagroup_finished,\
    dataset_finished, run_finished
from ..util import log

__all__ = ["DataSet", "Comparison"]


class DataSet(object):
    def __init__(self, name, predictions, test, exclude=set(), steps=1):
        self.name = name
        self.predictions = predictions
        self.test = test.for_comparison(exclude=exclude)
        self.steps = steps
        nnodes = len(test)
        # Universe = all possible edges, except for the ones that we no longer
        # consider (because they're already in the training network)
        self.num_universe = nnodes * (nnodes - 1) / 2 - len(exclude)
        log.logger.debug("Constructed dataset '%s': "
                         "num_universe = %d" % (self.name, self.num_universe))


class Comparison(object):

    def __init__(self):
        self.datasets = []

    def __iter__(self):
        return iter(self.datasets)

    def register_dataset(self, dataset):
        self.datasets.append(dataset)

    def register_datasets(self, datasets):
        for d in datasets:
            self.register_dataset(d)

    def run(self):
        for d in self.datasets:
            for predictorname, scoresheet in d.predictions:
                for evaluation in ranked_evaluation(scoresheet, d.test,
                                                    n=d.steps,
                                                    universe=d.num_universe):
                    new_evaluation.send(sender=self, evaluation=evaluation,
                                        dataset=d.name, predictor=predictorname)
                datagroup_finished.send(sender=self, dataset=d.name,
                                        predictor=predictorname)
            dataset_finished.send(sender=self, dataset=d.name)
        run_finished.send(sender=self)
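
Tying the evaluation classes together, a minimal sketch (again not part of the commit) of driving a Comparison by hand might look as follows. It assumes that predictions is an iterable of (name, Scoresheet) pairs such as predict() yields, and that test is a result object exposing for_comparison() and __len__(), as DataSet expects.

from linkpred.evaluation import Comparison, DataSet

# `predictions` and `test` are assumed to come from predict() and
# training_test_data() in linkpred/core.py above.
dataset = DataSet("example", predictions, test, steps=10)

comp = Comparison()
comp.register_dataset(dataset)
comp.run()  # sends new_evaluation, datagroup_finished, dataset_finished, run_finished
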