diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d2d6f36 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +*.py[cod] + +# C extensions +*.so + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg +lib +lib64 + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox +nosetests.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject diff --git a/linkpred/__init__.py b/linkpred/__init__.py new file mode 100644 index 0000000..308afc8 --- /dev/null +++ b/linkpred/__init__.py @@ -0,0 +1 @@ +"""linkpred, a Python module for link prediction""" diff --git a/linkpred/cli.py b/linkpred/cli.py new file mode 100644 index 0000000..9821a91 --- /dev/null +++ b/linkpred/cli.py @@ -0,0 +1,140 @@ +import json +from optparse import OptionParser + +from .predictors import all_predictors +from .util import log + +__all__ = ["load_profile", "get_profile", "get_profile_by_options", + "options_n_args"] + + +def data_from_profile(fname): + data = {} + try: + with open(fname) as f: + if fname.endswith(".yaml"): + import yaml + data = yaml.safe_load(f) + else: + data = json.load(f) + except (AttributeError, TypeError) as e: + log.logger.warning("Encountered error '%s'" % e) + finally: + return data + + +def fancy_update(base, new): + updated = dict(base.iteritems()) + for k, v in new.iteritems(): + if k not in base: + updated[k] = v + elif type(base[k]) == type(v) == dict: + updated[k] = fancy_update(base[k], v) + elif type(base[k]) == type(v) == list: + updated[k].extend(v) + else: + updated[k] = v + return updated + + +def load_profile(*fnames): + """ + Load profile from one or more files + + Arguments + --------- + fnames : one or more strings (file names) + + """ + profile = {} + for fname in fnames: + data = data_from_profile(fname) + profile = fancy_update(profile, data) + return profile + + +def get_profile(**kwargs): + options, args = options_n_args(**kwargs) + if args: + log.logger.warning("Ignoring arguments: %s" % str(args)) + return get_profile_by_options(options) + + +def get_profile_by_options(options): + """Determine a profile based on available options + + If multiple profiles are passed through the CLI interface, + they are merged into one. In case of conflicts, the last profile + supersedes the previous ones. + Other CLI options supersede the profiles. + + Arguments + --------- + + options : an optparse.Options object + + Returns + ------- + + profile : a dict + + """ + profile = load_profile(*options.profile) + + option_names = ["charts", "filetype", "interpolation", "steps", "only_new"] + for option_name in option_names: + try: + option = getattr(options, option_name) + except AttributeError: + continue + profile[option_name] = option + + if hasattr(options, 'predictors') and options.predictors: + profile['predictors'] = [] + for p in options.predictors: + profile['predictors'].append({'name': p}) + + return profile + + +def options_n_args(choose_chart=True, choose_profile=True, + choose_predictor=True, choose_filetype=False, + choose_weight=False, choose_interpolation=False): + """Get nice CLI interface and return options 'n arguments.""" + + parser = OptionParser() + parser.add_option("--debug", action="store_true", dest="debug", + default=False, help="Log debug messages") + if choose_chart: + chart_help = "Type of chart(s) to produce (default: all available)." 
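+        # NB: optparse's "append" action appends user-supplied values to the
+        # default list object itself, so an explicit -c/--chart adds to,
+        # rather than replaces, the default set of chart types defined below.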
+ chart_types = ["recall-precision", "F-score", "ROC"] + parser.add_option("-c", "--chart", help=chart_help, action="append", + choices=chart_types, dest="charts", default=chart_types) + if choose_filetype: + parser.add_option("-f", "--filetype", + help="Output file type (default: %default)", default="pdf") + if choose_interpolation: + parser.add_option("-i", "--no-interpolation", + help="Do not interpolate precision", action="store_false", + dest="interpolation", default=True) + if choose_predictor: + predictors = [p.__name__ for p in all_predictors()] + parser.add_option( + "-p", "--predictors", action="append", dest="predictors", + help="Predicting methods to use (default: all available)", + choices=predictors, default=[]) + parser.add_option( + "-n", "--only-new", action="store_true", dest="only_new", + default=False, + help="Only consider new (unattested) predictions") + if choose_profile: + parser.add_option("-P", "--profile", action="append", + help="JSON profile file", default=[]) + + options, args = parser.parse_args() + if options.debug: + log.logger.setLevel(log.logging.DEBUG) + else: + log.logger.setLevel(log.logging.INFO) + + return options, args diff --git a/linkpred/core.py b/linkpred/core.py new file mode 100644 index 0000000..5c1c99a --- /dev/null +++ b/linkpred/core.py @@ -0,0 +1,150 @@ +from . import predictors +from .evaluation import Comparison +from .result import ResultDict # XXX +from .util import log + + +def training_test_data(profile, minimum_degree=1, **kwargs): + endpoint = profile["sparql_endpoint"] + query = profile.get("query") + training = profile["training"] + test = profile["test"] + + results = ResultDict() + for dataprofile in (training, test): + name = dataprofile["name"] + parameters = dataprofile["parameters"] + if query is None: + query = dataprofile["query"] + + log.logger.info("Collecting data (%s)..." % name) + # XXX TODO XXX + results[name] = "TODO" + log.logger.info("Finished collecting data.") + + if minimum_degree: + results.filter_all_low_degree_nodes(minimum_degree) + return results[training['name']], results[test['name']] + + +def pretty_print(name, bipartite=False, tfidf=False, params={}): + """Pretty print a predictor name""" + retval = name + if bipartite: + retval += " bipartite" + if tfidf: + retval += " TF-IDF" + if not params: + return retval + + pretty_params = ", ".join("%s = %s" % (k, str(v)) + for k, v in params.iteritems()) + return "%s (%s)" % (retval, pretty_params) + + +def to_tfidf(G): + """TF-IDF transform the edges of G + + This is done by transforming its adjacency matrix and then converting back + to a network. + """ + import networkx as nx + from linkpred.matrix import tfidf_matrix + from linkpred.network import from_biadjacency_matrix + + assert nx.is_bipartite(G) + row_items = [n for n in G.nodes_iter() if G.node[n]['eligible']] + col_items = [n for n in G.nodes_iter() if not G.node[n]['eligible']] + matrix = nx.bipartite.biadjacency_matrix(G, row_items, col_items) + matrix = tfidf_matrix(matrix) + G2 = from_biadjacency_matrix(matrix, row_items, col_items) + G2.node = G.node + return G2 + + +def do_predict(G, predictortype, label, eligible=None, only_new=False, **kwargs): + log.logger.info("Executing %s..." % label) + predictor = predictortype(G, eligible=eligible, only_new=only_new) + scoresheet = predictor(**kwargs) + log.logger.info("Finished executing %s." 
% label) + return scoresheet + + +def predict(training, profile, only_new=False, eligible=None): + """Generator that yields predictions on the basis of training + + Arguments + --------- + training : a Result + Training data + + profile : a dict + Profile detailing which predictors should be used + + only_new : True|False + Whether or not we should restrict ourselves to predicting only new links + + eligible : a string or None + If a string, the attribute according to which 'eligible' nodes are found. + If None, this is ignored. + + Returns + ------- + (label, scoresheet) : a 2-tuple + 2-tuple consisting of a string (label of the prediction) and + a Scoresheet (actual predictions) + + """ + for predictor_profile in profile['predictors']: + bipartite = predictor_profile.get('bipartite', False) + tfidf = predictor_profile.get('tfidf', False) + parameters = predictor_profile.get('parameters', {}) + name = predictor_profile['name'] + predictortype = getattr(predictors, name) + label = predictor_profile.get('displayname', + pretty_print(name, bipartite, tfidf, parameters)) + + if bipartite and tfidf: + # Create a reusable TF-IDF network, so we don't have to do this + # transformation for each predictor. + if not hasattr(predict, 'tfidf_network'): + predict.tfidf_network = to_tfidf(training.pathspec) + G = predict.tfidf_network + elif bipartite: + G = training.pathspec + else: + G = training.network + + scoresheet = do_predict( + G, predictortype, label, eligible, only_new, **parameters) + + yield label, scoresheet + + +def connect_signals(listeners): + from linkpred.evaluation import signals + for listener in listeners: + signals.new_evaluation.connect(listener.on_new_evaluation) + signals.datagroup_finished.connect(listener.on_datagroup_finished) + signals.dataset_finished.connect(listener.on_dataset_finished) + signals.run_finished.connect(listener.on_run_finished) + + +def evaluate(datasets, name, filetype="pdf", interpolate=True, steps=1): + import linkpred.evaluation.listeners as l + # TODO figure out easy way to specify which listeners we want + cache = l.CachingListener() + rp = l.RecallPrecisionPlotter(name, filetype=filetype, + interpolate=interpolate) + f = l.FScorePlotter(name, filetype=filetype, xlabel="# predictions", + steps=steps) + roc = l.ROCPlotter(name, filetype=filetype) + fmax = l.FMaxListener(name) + connect_signals((cache, rp, f, roc, fmax)) + + comp = Comparison() + try: + comp.register_datasets(datasets) + except TypeError: # Oops, not iterable! 
+ comp.register_dataset(datasets) + comp.run() diff --git a/linkpred/evaluation/__init__.py b/linkpred/evaluation/__init__.py new file mode 100644 index 0000000..6870b0d --- /dev/null +++ b/linkpred/evaluation/__init__.py @@ -0,0 +1,5 @@ +"""Module for evaluating link prediction results""" +from .comparison import Comparison, DataSet +from .listeners import * +from .scoresheet import * +from .signals import * diff --git a/linkpred/evaluation/comparison.py b/linkpred/evaluation/comparison.py new file mode 100644 index 0000000..ed06338 --- /dev/null +++ b/linkpred/evaluation/comparison.py @@ -0,0 +1,49 @@ +from .ranked import ranked_evaluation +from .signals import new_evaluation, datagroup_finished,\ + dataset_finished, run_finished +from ..util import log + +__all__ = ["DataSet", "Comparison"] + + +class DataSet(object): + def __init__(self, name, predictions, test, exclude=set(), steps=1): + self.name = name + self.predictions = predictions + self.test = test.for_comparison(exclude=exclude) + self.steps = steps + nnodes = len(test) + # Universe = all possible edges, except for the ones that we no longer + # consider (because they're already in the training network) + self.num_universe = nnodes * (nnodes - 1) / 2 - len(exclude) + log.logger.debug("Constructed dataset '%s': " + "num_universe = %d" % (self.name, self.num_universe)) + + +class Comparison(object): + + def __init__(self): + self.datasets = [] + + def __iter__(self): + return iter(self.datasets) + + def register_dataset(self, dataset): + self.datasets.append(dataset) + + def register_datasets(self, datasets): + for d in datasets: + self.register_dataset(d) + + def run(self): + for d in self.datasets: + for predictorname, scoresheet in d.predictions: + for evaluation in ranked_evaluation(scoresheet, d.test, + n=d.steps, + universe=d.num_universe): + new_evaluation.send(sender=self, evaluation=evaluation, + dataset=d.name, predictor=predictorname) + datagroup_finished.send(sender=self, dataset=d.name, + predictor=predictorname) + dataset_finished.send(sender=self, dataset=d.name) + run_finished.send(sender=self) diff --git a/linkpred/evaluation/listeners.py b/linkpred/evaluation/listeners.py new file mode 100644 index 0000000..14bd9df --- /dev/null +++ b/linkpred/evaluation/listeners.py @@ -0,0 +1,225 @@ +import copy +import matplotlib.pyplot as plt + +from time import localtime, strftime + +from ..util import interpolate + +__all__ = ["Listener", "Plotter", "CachingListener", "FMaxListener", + "RecallPrecisionPlotter", "FScorePlotter", "ROCPlotter", + "PrecisionAtKListener", "MarkednessPlotter"] + + +class Listener(object): + + def on_new_evaluation(self, sender, **kwargs): + pass + + def on_datagroup_finished(self, sender, **kwargs): + pass + + def on_dataset_finished(self, sender, **kwargs): + pass + + def on_run_finished(self, sender, **kwargs): + pass + + +class CachingListener(Listener): + + def __init__(self): + self.cachefile = None + + def writeline(self, *args): + line = "\t".join(map(str, args)) + self.cachefile.write("%s\n" % line) + + def on_new_evaluation(self, sender, **kwargs): + evaluation, dataset, predictor = kwargs['evaluation'], \ + kwargs['dataset'], kwargs['predictor'] + tp, fp, fn, tn = evaluation.num_tp, evaluation.num_fp, \ + evaluation.num_fn, evaluation.num_tn + + if not self.cachefile: + fname = "%s-%s-cache.txt" % (dataset, predictor) + self.cachefile = open(fname, 'w') + # Header row + self.writeline('tp', 'fp', 'fn', 'tn') + self.writeline(tp, fp, fn, tn) + + def on_datagroup_finished(self, sender, 
**kwargs): + if not self.cachefile: + return + self.cachefile.close() + self.cachefile = None + + +class FMaxListener(Listener): + def __init__(self, name, beta=1): + self.beta = beta + self.reset_data() + self.fname = "%s-Fmax" % name + \ + strftime("_%Y-%m-%d_%H.%M.txt", localtime()) + + def reset_data(self): + self._f = [] + + def on_new_evaluation(self, sender, **kwargs): + evaluation = kwargs['evaluation'] + self._f.append(evaluation.f_score(self.beta)) + + def on_datagroup_finished(self, sender, **kwargs): + fmax = max(self._f) if self._f else 0 + self.reset_data() + + status = "%s\t%s\t%.4f\n" % ( + kwargs['dataset'], kwargs['predictor'], fmax) + + with open(self.fname, 'a') as f: + f.write(status) + print status + + +class PrecisionAtKListener(Listener): + def __init__(self, name, k=10, steps=1): + self.k = k + self.steps = steps + self.reset_data() + + self.fname = "%s-precision-at-%d" % (name, self.k) + \ + strftime("_%Y-%m-%d_%H.%M.txt", localtime()) + + def reset_data(self): + self.precision = 0.0 + self.count = 0 + + def on_new_evaluation(self, sender, **kwargs): + self.count += 1 + if self.count / self.steps == self.k: + self.precision = kwargs['evaluation'].precision() + + def on_datagroup_finished(self, sender, **kwargs): + status = "%s\t%s\t%.4f\n" % (kwargs['dataset'], + kwargs['predictor'], + self.precision) + + with open(self.fname, 'a') as f: + f.write(status) + print status + + self.reset_data() + + +generic_chart_looks = ['k-', 'k--', 'k.-', 'k:', + 'r-', 'r--', 'r.-', 'r:', + 'b-', 'b--', 'b.-', 'b:', + 'g-', 'g--', 'g.-', 'g:', + 'c-', 'c--', 'c.-', 'c:', + 'y-', 'y--', 'y.-', 'y:'] + + +class Plotter(Listener): + def __init__(self, name, xlabel="", ylabel="", filetype="pdf", chart_looks=[]): + self.name = name + self.filetype = filetype + self.chart_looks = chart_looks + self._charttype = "" + self._legend_props = {'prop': {'size': 'x-small'}} + self.fig = plt.figure() + self.fig.add_axes([0.1, 0.1, 0.8, 0.8], xlabel=xlabel, ylabel=ylabel) + self.reset_data() + + def reset_data(self): + self._x = [] + self._y = [] + + def add_line(self, dataset="", predictor="", default_look=generic_chart_looks): + label = self.build_label(dataset, predictor) + ax = self.fig.axes[0] + ax.plot(self._x, self._y, self.chart_look(default_look), label=label) + + def build_label(self, dataset="", predictor=""): + return predictor + + def chart_look(self, default): + if not self.chart_looks: + self.chart_looks = copy.copy(default) + return self.chart_looks.pop(0) + + def on_datagroup_finished(self, sender, **kwargs): + self.add_line(kwargs['dataset'], kwargs['predictor']) + self.reset_data() + + def on_run_finished(self, sender, **kwargs): + # Fix looks + for ax in self.fig.axes: + ax.legend(**self._legend_props) + + # Save to file + fname = "%s-%s" % (self.name, self._charttype) + \ + strftime("_%Y-%m-%d_%H.%M.", localtime()) + self.filetype + self.fig.savefig(fname) + + +class RecallPrecisionPlotter(Plotter): + def __init__(self, name, xlabel="Recall", ylabel="Precision", + interpolate=True, **kwargs): + Plotter.__init__(self, name, xlabel, ylabel, **kwargs) + self._charttype = "recall-precision" + self.interpolate = interpolate + + def reset_data(self): + # Make sure that we always start in the top-left corner + self._x = [0.] + self._y = [1.] 
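+        # Note: add_line (below) runs these precision values through the
+        # interpolate helper imported from linkpred.util before plotting,
+        # when interpolation is enabled; that helper's implementation is not
+        # part of this diff.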
+ + def add_line(self, dataset="", predictor=""): + if self.interpolate: + self._y = interpolate(self._y) + Plotter.add_line(self, dataset, predictor) + + def on_new_evaluation(self, sender, **kwargs): + evaluation = kwargs['evaluation'] + + self._x.append(evaluation.recall()) + self._y.append(evaluation.precision()) + + +class FScorePlotter(Plotter): + def __init__(self, name, xlabel="#", ylabel="F-score", beta=1, steps=1, **kwargs): + Plotter.__init__(self, name, xlabel, ylabel, **kwargs) + self._charttype = "F-Score" + self.beta = beta + self.steps = steps + + def on_new_evaluation(self, sender, **kwargs): + evaluation = kwargs['evaluation'] + + self._x.append(self.steps * len(self._x)) + self._y.append(evaluation.f_score(self.beta)) + + +class ROCPlotter(Plotter): + def __init__(self, name, xlabel="False pos. rate", + ylabel="True pos. rate", **kwargs): + Plotter.__init__(self, name, xlabel, ylabel, **kwargs) + self._charttype = "ROC" + + def on_new_evaluation(self, sender, **kwargs): + evaluation = kwargs['evaluation'] + + self._x.append(evaluation.fallout()) + self._y.append(evaluation.recall()) + + +class MarkednessPlotter(Plotter): + def __init__(self, name, xlabel="Miss", ylabel="Precision", **kwargs): + Plotter.__init__(self, name, xlabel, ylabel, **kwargs) + self._charttype = "Markedness" + self._legend_props["loc"] = "upper left" + + def on_new_evaluation(self, sender, **kwargs): + evaluation = kwargs['evaluation'] + + self._x.append(evaluation.miss()) + self._y.append(evaluation.precision()) diff --git a/linkpred/evaluation/ranked.py b/linkpred/evaluation/ranked.py new file mode 100644 index 0000000..a7644b4 --- /dev/null +++ b/linkpred/evaluation/ranked.py @@ -0,0 +1,24 @@ +from .static import StaticEvaluation + + +def ranked_evaluation(retrieved, relevant, n=None, **kwargs): + """Generator for ranked evaluation of IR + + Arguments + --------- + retrieved : a Scoresheet + score sheet of ranked retrieved results + + relevant : a set + set of relevant results + + n : an integer + At each step, the next n items on the retrieved score sheet are + added to the set of retrieved items that are compared to the relevant + ones. + + """ + evaluation = StaticEvaluation(relevant=relevant, **kwargs) + for ret in retrieved.successive_sets(n=n): + evaluation.update_retrieved(ret) + yield evaluation diff --git a/linkpred/evaluation/scoresheet.py b/linkpred/evaluation/scoresheet.py new file mode 100644 index 0000000..a6b3726 --- /dev/null +++ b/linkpred/evaluation/scoresheet.py @@ -0,0 +1,181 @@ +import networkx as nx + +from collections import defaultdict +from ..util import log + +__all__ = ["Pair", "BaseScoresheet", "Scoresheet"] + + +def _boundaries(start, steps, threshold, successive): + for i in range(start, threshold, steps): + begin = i if successive else start + end = i + steps + yield begin, end + + +class BaseScoresheet(defaultdict): + """Score sheet for evaluation of IR and similar + + This is a simple dict-like object, whose values are typically numeric + (floats). It adds the methods `sets`, `successive_sets` and `top`. + + Example + ------- + >>> data = {('a', 'b'): 0.8, ('b', 'c'): 0.5, ('c', 'a'): 0.2} + >>> sheet = Scoresheet(data) + >>> for s in sheet.sets(steps=2): + ... 
print s + + """ + def __init__(self, data=None, n=100): + defaultdict.__init__(self, float) + if data: + self.update(self.process_data(data)) + self.n = n + + def __setitem__(self, key, val): + dict.__setitem__(self, key, float(val)) + + def process_data(self, data): + """Can be overridden by child classes""" + return data + + def sets(self, n=None, threshold=None, successive=False, as_dict=False): + """Return sets of items on the scoresheet in decreasing order + + Arguments + --------- + + start : int + Where to start for first set + + n : int + Number of items per set + + threshold : int + Maximum number of items to return (in total) + Note that this is treated as a size hint, rather than a strict limit. + + successive : True|False + if True, return successive sets; if False, return incremental sets + + """ + n = n or self.n + threshold = threshold or len(self) + log.logger.debug("Called Scoresheet.sets(): n=%d, " + "threshold=%d" % (n, threshold)) + + # Sort first by score, then by key. This way, we always get the same + # ranking, even in case of ties. + # We use the tmp structure because it is much faster than + # itemgetter(1, 0). + tmp = ((score, key) for key, score in self.iteritems()) + if as_dict: + ranked_data = [(key, score) for score, + key in sorted(tmp, reverse=True)] + else: + ranked_data = [key for _, key in sorted(tmp, reverse=True)] + size = len(ranked_data) + + for begin, end in _boundaries(0, n, threshold, successive): + if begin >= size: + raise StopIteration + if as_dict: + yield dict(ranked_data[begin:end]) + else: + yield set(ranked_data[begin:end]) + + def successive_sets(self, n=None, threshold=None): + return self.sets(n, threshold, True) + + def top(self, n=10): + top_n = self.sets(n=n).next() + return {k: self[k] for k in top_n} + + +class Pair(object): + """An unsorted pair of things. + + We could probably also use frozenset for this, but a Pair class opens + possibilities for the future, such as extensions to 'directed' pairs + (where the order is important) or to self-loops (where the two elements + are the same). + + Example + ------- + >>> t = ('a', 'b') + >>> Pair(t) == Pair(*t) == Pair('b', 'a') + True + + """ + def __init__(self, *args): + if len(args) == 1: + key = args[0] + if isinstance(key, Pair): + a, b = key.elements + elif isinstance(key, tuple) and len(key) == 2: + a, b = key + else: + raise TypeError("Key '%s' is not a Pair or tuple." % (key)) + pass + elif len(args) == 2: + a, b = args + else: + raise TypeError( + "__init__() takes 1 or 2 arguments in addition to self") + # For link prediction, a and b are two different nodes + assert a != b, "Predicted link (%s, %s) is a self-loop!" % (a, b) + self.elements = (a, b) if a > b else (b, a) + + def __eq__(self, other): + return self.elements == other.elements + + def __ne__(self, other): + return self.elements != other.elements + + def __lt__(self, other): + return self.elements < other.elements + + def __gt__(self, other): + return self.elements > other.elements + + def __getitem__(self, idx): + return self.elements[idx] + + def __hash__(self): + return hash(self.elements) + + def __str__(self): + a, b = self.elements + return "Pair(%s, %s)" % (str(a), str(b)) + + def __repr__(self): + return str(self) + + def __iter__(self): + return iter(self.elements) + + +class Scoresheet(BaseScoresheet): + """Scoresheet for link prediction + + Scoresheet's keys are always Pairs. 
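+
+    Example
+    -------
+    A small illustrative sketch (mirroring the behaviour exercised in the
+    tests): keys are normalised to Pair objects and values to floats, so
+    the node order used for lookup does not matter.
+
+    >>> sheet = Scoresheet()
+    >>> sheet[('a', 'b')] = 5
+    >>> sheet[('b', 'a')]
+    5.0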
+ + """ + def __getitem__(self, key): + return BaseScoresheet.__getitem__(self, Pair(key)) + + def __setitem__(self, key, val): + BaseScoresheet.__setitem__(self, Pair(key), float(val)) + + def __delitem__(self, key): + return dict.__delitem__(self, Pair(key)) + + def process_data(self, data, weight='weight'): + if isinstance(data, dict): + return {Pair(k): float(v) for k, v in data.iteritems()} + if isinstance(data, nx.Graph): + return {Pair(u, v): float(d[weight]) for u, v, d + in data.edges(data=True)} + # We assume that data is some sort of iterable, like a list or tuple + return {Pair(k): float(v) for k, v in data} diff --git a/linkpred/evaluation/signals.py b/linkpred/evaluation/signals.py new file mode 100644 index 0000000..c324b4a --- /dev/null +++ b/linkpred/evaluation/signals.py @@ -0,0 +1,10 @@ +import dispatch + +__all__ = ["new_evaluation", "datagroup_finished",\ + "dataset_finished", "run_finished"] + +new_evaluation = dispatch.Signal(providing_args=["evaluation", "dataset",\ + "predictor"]) +datagroup_finished = dispatch.Signal(providing_args=["dataset", "predictor"]) +dataset_finished = dispatch.Signal(providing_args=["dataset"]) +run_finished = dispatch.Signal() diff --git a/linkpred/evaluation/static.py b/linkpred/evaluation/static.py new file mode 100644 index 0000000..9345fc7 --- /dev/null +++ b/linkpred/evaluation/static.py @@ -0,0 +1,187 @@ +from ..util import log + + +class StaticEvaluation(object): + """ + Static evaluation of IR + """ + def __init__(self, retrieved=[], relevant=[], universe=None): + """ + Initialize IR evaluation. + + We determine the following table: + + +--------------+---------------+ + | tp | fp | + | ret & rel | ret & ~rel | + +--------------+---------------+ + | fn | tn | + | ~ret & rel | ~ret & ~rel | + +--------------+---------------+ + + Arguments + --------- + retrieved : a list or set + iterable of the retrieved items + + relevant : a list or set + iterable of the relevant items + + universe : a list or set, an int or None + If universe is an iterable, it is interpreted as the set of all items + in the system. + If universe is an int, it is interpreted as the *number* of items in + the system. This allows for fewer checks but is more memory-efficient. + If universe is None, it is supposed to be unknown. This still allows for + some measures, including precision and recall, to be calculated. 
+ + """ + retrieved = set(retrieved) + relevant = set(relevant) + + self.fp = retrieved - relevant + self.fn = relevant - retrieved + self.tp = retrieved & relevant + if universe is None: + self.tn = None + self.num_universe = -1 + elif isinstance(universe, int): + self.tn = None + self.num_universe = universe + if len(retrieved) > self.num_universe: + raise ValueError("Retrieved cannot be larger than universe.") + if len(relevant) > self.num_universe: + raise ValueError("Retrieved cannot be larger than universe.") + else: + universe = set(universe) + if not (retrieved <= universe and relevant <= universe): + raise ValueError("Retrieved and relevant should be " + "subsets of universe.") + self.tn = universe - retrieved - relevant + del universe + self.update_counts() + + def update_counts(self): + self.num_fp = len(self.fp) + self.num_fn = len(self.fn) + self.num_tp = len(self.tp) + if self.tn is not None: + self.num_tn = len(self.tn) + elif self.num_universe == -1: + self.num_tn = -1 + else: + self.num_tn = self.num_universe - self.num_fp \ + - self.num_fn - self.num_tp + assert self.num_tn >= 0 + + def update_retrieved(self, new): + new = set(new) + + if not (new.isdisjoint(self.tp) and new.isdisjoint(self.fp)): + raise ValueError("One or more elements in `new` have " + "already been retrieved.") + + relevant_new = new & self.fn + nonrelevant_new = new - relevant_new + + self.tp |= relevant_new + self.fp |= nonrelevant_new + if self.tn: + if not new <= self.fn | self.tn: + raise ValueError("Newly retrieved items should be a subset " + "of currently unretrieved items.") + self.tn -= nonrelevant_new + self.fn -= relevant_new + self.update_counts() + + def precision(self): + try: + return float(self.num_tp) / (self.num_tp + self.num_fp) + except ZeroDivisionError: + log.logger.warning("Division by 0 in calculating precision: " + "tp = %d, fp = %d, fn = %d, tn = %d" % + (self.num_tp, self.num_fp, self.num_tn, self.num_tn)) + return 0.0 + + def recall(self): + try: + return float(self.num_tp) / (self.num_tp + self.num_fn) + except ZeroDivisionError: + log.logger.warning("Division by 0 in calculating recall: " + "tp = %d, fp = %d, fn = %d, tn = %d" % + (self.num_tp, self.num_fp, self.num_tn, self.num_tn)) + return 0.0 + + def fallout(self): + if self.num_tn == -1: + raise ValueError( + "Cannot determine fallout if universe is undefined") + try: + return float(self.num_fp) / (self.num_fp + self.num_tn) + except ZeroDivisionError: + log.logger.warning("Division by 0 in calculating fallout: " + "tp = %d, fp = %d, fn = %d, tn = %d" % + (self.num_tp, self.num_fp, self.num_tn, self.num_tn)) + return 0.0 + + def miss(self): + if self.num_tn == -1: + raise ValueError("Cannot determine miss if universe is undefined") + try: + return float(self.num_fn) / (self.num_fn + self.num_tn) + except ZeroDivisionError: + log.logger.warning("Division by 0 in calculating miss: " + "tp = %d, fp = %d, fn = %d, tn = %d" % + (self.num_tp, self.num_fp, self.num_tn, self.num_tn)) + return 0.0 + + def accuracy(self): + """Compute accuracy = |correct| / |universe| + + Not appropriate for IR, since over 99.9% is nonrelevant. A system that + labels everything as nonrelevant, would still have high accuracy. 
+ """ + if self.num_tn == -1: + raise ValueError( + "Cannot determine accuracy if universe is undefined") + try: + return float(self.num_tp + self.num_tn) / \ + (self.num_tp + self.num_fp + self.num_tn + self.num_fn) + except ZeroDivisionError: + log.logger.warning("Division by 0 in calculating accuracy: " + "tp = %d, fp = %d, fn = %d, tn = %d" % + (self.num_tp, self.num_fp, self.num_tn, self.num_tn)) + return 0.0 + + def f_score(self, beta=1): + """Compute F-measure or F-score. + + F is the weighted harmonic mean of recall R and precision P: + F = 2PR / (P + R) + In this case, R and P are evenly weighted. More generally: + F = (1 + b^2)PR / (b^2 * P + R) + If beta = 2, R is weighted twice as much as P. + If beta = 0.5, R is weighted half as much as P. + + """ + p = self.precision() + r = self.recall() + beta_squared = beta ** 2 + try: + return float((1 + beta_squared) * p * r) / (beta_squared * p + r) + except ZeroDivisionError: + return 0.0 + + def generality(self): + """Compute generality = |relevant| / |universe|""" + if self.num_tn == -1: + raise ValueError( + "Cannot determine generality if universe is undefined") + try: + return float(self.num_tp + self.num_fn) / \ + (self.num_tp + self.num_fp + self.num_tn + self.num_fn) + except ZeroDivisionError: + log.logger.warning("Division by 0 in calculating generality: " + "tp = %d, fp = %d, fn = %d, tn = %d" % + (self.num_tp, self.num_fp, self.num_tn, self.num_tn)) + return 0.0 diff --git a/linkpred/evaluation/tests/test_comparison.py b/linkpred/evaluation/tests/test_comparison.py new file mode 100644 index 0000000..b9966f8 --- /dev/null +++ b/linkpred/evaluation/tests/test_comparison.py @@ -0,0 +1,24 @@ +import networkx as nx +from nose.tools import assert_equal + +from linkpred.evaluation.comparison import DataSet +from linkpred.result import Result + + +def test_dataset_init(): + name = "test" + predictions = {("a", "b"): 1, ("b", "c"): 2} + test_network = nx.Graph() + test_network.add_edges_from([("a", "b"), ("b", "c"), ("c", "d"), ("c", "e")]) + test = Result(test_network, eligible=None) + steps = 5 + + d = DataSet(name, predictions, test, steps=steps) + assert_equal(d.name, name) + assert_equal(d.predictions, predictions) + assert_equal(d.steps, steps) + assert_equal(d.num_universe, 10) + + d = DataSet(name, predictions, test, exclude=set([("c", "d"), ("d", "e")]), + steps=steps) + assert_equal(d.num_universe, 8) diff --git a/linkpred/evaluation/tests/test_scoresheet.py b/linkpred/evaluation/tests/test_scoresheet.py new file mode 100644 index 0000000..5f1ec2a --- /dev/null +++ b/linkpred/evaluation/tests/test_scoresheet.py @@ -0,0 +1,99 @@ +import networkx as nx +from nose.tools import assert_dict_equal, assert_equal, assert_less, raises + +from linkpred.evaluation.scoresheet import BaseScoresheet, Pair, Scoresheet + + +class TestBaseScoresheet: + def setup(self): + self.n = 3 + self.scoresheet = BaseScoresheet( + zip("abcdefghijklmnopqrstuvwx", range(24)), n=self.n) + + def test_sets(self): + for i, s in enumerate(self.scoresheet.sets(), start=1): + assert_equal(len(s), i * self.n) + for i, s in enumerate(self.scoresheet.successive_sets()): + assert_equal(len(s), self.n) + + def test_sets_with_n(self): + n = 8 + for i, s in enumerate(self.scoresheet.sets(n=n), start=1): + assert_equal(len(s), i * n) + for s in self.scoresheet.successive_sets(n=n): + assert_equal(len(s), n) + + def test_sets_with_even_threshold(self): + threshold = 12 + for i, s in enumerate(self.scoresheet.sets(threshold=threshold), start=1): + assert_equal(len(s), 
i * self.n) + for s in self.scoresheet.successive_sets(threshold=threshold): + assert_equal(len(s), self.n) + + def test_with_too_large_threshold(self): + threshold = 25 + for s in self.scoresheet.sets(threshold=threshold): + assert_less(len(s), threshold) + for s in self.scoresheet.successive_sets(threshold=threshold): + assert_equal(len(s), self.n) + + def test_sets_with_uneven_threshold(self): + """ + If the threshold does not nicely fit a 'boundary', only the last set + should be affected. + """ + threshold = 10 + + result = list(enumerate(self.scoresheet.sets(threshold=threshold), start=1)) + for i, s in result: + assert_equal(len(s), i * self.n) + + result = list(self.scoresheet.successive_sets(threshold=threshold)) + for s in result: + assert_equal(len(s), self.n) + + def test_top(self): + top = self.scoresheet.top() + assert_dict_equal(top, dict(zip("opqrstuvwx", range(14, 24)))) + + top = self.scoresheet.top(2) + assert_dict_equal(top, dict(zip("wx", range(22, 24)))) + + top = self.scoresheet.top(100) + assert_equal(len(top), 24) + + +def test_pair(): + t = ('a', 'b') + pair = Pair(t) + assert_equal(pair, Pair(*t)) + assert_equal(pair, Pair('b', 'a')) + assert_equal(str(pair), "Pair(b, a)") + + +@raises(AssertionError) +def test_pair_identical_elements(): + Pair('a', 'a') + + +def test_scoresheet(): + sheet = Scoresheet() + t = ('a', 'b') + sheet[t] = 5 + assert_equal(len(sheet), 1) + assert_equal(sheet.items(), [(Pair('a', 'b'), 5.0)]) + assert_equal(sheet[t], 5.0) + del sheet[t] + assert_equal(len(sheet), 0) + + +def test_scoresheet_process_data(): + t = ('a', 'b') + d = {t: 5} + G = nx.Graph() + G.add_edge(*t, weight=5) + s = [(t, 5)] + + for x in (d, G, s): + sheet = Scoresheet(x) + assert_equal(sheet[t], 5.0) diff --git a/linkpred/evaluation/tests/test_static.py b/linkpred/evaluation/tests/test_static.py new file mode 100644 index 0000000..a6b9d84 --- /dev/null +++ b/linkpred/evaluation/tests/test_static.py @@ -0,0 +1,145 @@ +from nose.tools import * + +from linkpred.evaluation.static import StaticEvaluation + +class TestStaticEvaluation: + def setup(self): + self.ret = range(5) + self.rel = [3, 4, 5, 6] + self.num_universe = 20 + self.universe = range(self.num_universe) + + def test_init(self): + e = StaticEvaluation(self.ret, self.rel, self.universe) + assert_equal(len(e.tp), 2) + assert_equal(len(e.fp), 3) + assert_equal(len(e.tn), 13) + assert_equal(len(e.fn), 2) + + e_no_universe = StaticEvaluation(self.ret, self.rel) + assert_equal(len(e.tp), len(e_no_universe.tp)) + assert_equal(len(e.fp), len(e_no_universe.fp)) + assert_equal(len(e.fn), len(e_no_universe.fn)) + assert_equal(e_no_universe.tn, None) + + e_num_universe = StaticEvaluation(self.ret, self.rel, self.num_universe) + assert_equal(len(e.tp), 2) + assert_equal(len(e.fp), 3) + assert_equal(len(e.fn), 2) + assert_equal(len(e.tp), e.num_tp) + assert_equal(len(e.fp), e.num_fp) + assert_equal(len(e.fn), e.num_fn) + assert_equal(e.num_tn, 13) + + def test_update_retrieved(self): + e = StaticEvaluation(self.ret, self.rel, self.universe) + e.update_retrieved([6, 7]) + assert_equal(len(e.tp), 3) + assert_equal(len(e.fp), 4) + assert_equal(len(e.tn), 12) + assert_equal(len(e.fn), 1) + + assert_raises(ValueError, e.update_retrieved, [1]) # fp + assert_raises(ValueError, e.update_retrieved, [3]) # tp + assert_raises(ValueError, e.update_retrieved, ['a']) + + def test_update_retrieved_num_universe(self): + e = StaticEvaluation(self.ret, self.rel, self.num_universe) + e.update_retrieved([6, 7]) + assert_equal(len(e.tp), 3) + 
assert_equal(len(e.fp), 4) + assert_equal(len(e.fn), 1) + assert_equal(e.num_tp, 3) + assert_equal(e.num_fp, 4) + assert_equal(e.num_tn, 12) + assert_equal(e.num_fn, 1) + + assert_raises(ValueError, e.update_retrieved, [1]) # fp + assert_raises(ValueError, e.update_retrieved, [3]) # tp + + def test_update_retrieved_full(self): + e = StaticEvaluation(relevant=range(5), universe=20) + e.update_retrieved(range(10)) + e.update_retrieved(range(10, 20)) + assert_equal(e.num_tp, 5) + assert_equal(e.num_fp, 15) + assert_equal(e.num_fn, 0) + assert_equal(e.num_tn, 0) + + @raises(ValueError) + def test_ret_no_universe_subset(self): + e = StaticEvaluation([1, 2, 'a'], [2, 3], range(10)) + + @raises(ValueError) + def test_rel_no_universe_subset(self): + e = StaticEvaluation([1, 2], [2, 3, 'a'], range(10)) + + @raises(ValueError) + def test_ret_larger_than_universe(self): + e = StaticEvaluation(range(11), [2, 3], 10) + + @raises(ValueError) + def test_rel_larger_than_universe(self): + e = StaticEvaluation([1, 2], range(11), 10) + + def test_measures(self): + e = StaticEvaluation(self.ret, self.rel, self.universe) + assert_equal(e.precision(), float(2) / 5) + assert_equal(e.recall(), float(2) / 4) + assert_equal(e.fallout(), float(3) / 16) + assert_equal(e.miss(), float(2) / 15) + assert_equal(e.accuracy(), float(15) / 20) + assert_equal(e.generality(), float(4) / 20) + + e = StaticEvaluation(self.ret, self.rel) + assert_equal(e.precision(), float(2) / 5) + assert_equal(e.recall(), float(2) / 4) + assert_raises(ValueError, e.fallout) + assert_raises(ValueError, e.miss) + assert_raises(ValueError, e.accuracy) + assert_raises(ValueError, e.generality) + + e = StaticEvaluation(self.ret, self.rel, self.num_universe) + assert_equal(e.precision(), float(2) / 5) + assert_equal(e.recall(), float(2) / 4) + assert_equal(e.fallout(), float(3) / 16) + assert_equal(e.miss(), float(2) / 15) + assert_equal(e.accuracy(), float(15) / 20) + assert_equal(e.generality(), float(4) / 20) + + def test_measures_with_zero_universe(self): + e = StaticEvaluation([], [], []) + assert_equal(e.precision(), 0.) + assert_equal(e.recall(), 0.) + assert_equal(e.f_score(), 0.) + assert_equal(e.fallout(), 0.) + assert_equal(e.miss(), 0.) + assert_equal(e.accuracy(), 0.) + assert_equal(e.generality(), 0.) + + def test_measures_with_zero_num_universe(self): + e = StaticEvaluation([], [], 0) + assert_equal(e.precision(), 0.) + assert_equal(e.recall(), 0.) + assert_equal(e.f_score(), 0.) + assert_equal(e.fallout(), 0.) + assert_equal(e.miss(), 0.) + assert_equal(e.accuracy(), 0.) + assert_equal(e.generality(), 0.) + + def test_measures_with_zero_no_universe(self): + e = StaticEvaluation([], []) + assert_equal(e.precision(), 0.) + assert_equal(e.recall(), 0.) + assert_equal(e.f_score(), 0.) + assert_raises(ValueError, e.fallout) + assert_raises(ValueError, e.miss) + assert_raises(ValueError, e.accuracy) + assert_raises(ValueError, e.generality) + + def test_f_score(self): + e = StaticEvaluation(self.ret, self.rel) + assert_almost_equal(e.f_score(), 4. / 9.) + # $F_\beta = \frac{\beta^2 + 1 |rel \cap ret|}{\beta^2 |rel| + |ret|}$ + assert_almost_equal(e.f_score(0.5), 1.25 * 2. / 6.) + assert_almost_equal(e.f_score(2), 10. / 21.) 
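+        # Worked numbers for the assertions above (ret = range(5) and
+        # rel = [3, 4, 5, 6] from setup): P = 2/5 and R = 2/4, so
+        # F1 = 2PR / (P + R) = 0.4 / 0.9 = 4/9,
+        # F0.5 = 1.25PR / (0.25P + R) = 0.25 / 0.6 = 1.25 * 2/6, and
+        # F2 = 5PR / (4P + R) = 1 / 2.1 = 10/21.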
diff --git a/linkpred/network/__init__.py b/linkpred/network/__init__.py new file mode 100644 index 0000000..91e707f --- /dev/null +++ b/linkpred/network/__init__.py @@ -0,0 +1,5 @@ +from .addremove import * +from .algorithms import * +from .community import * +from .misc import * +from .pajek import * diff --git a/linkpred/network/addremove.py b/linkpred/network/addremove.py new file mode 100644 index 0000000..2a41e08 --- /dev/null +++ b/linkpred/network/addremove.py @@ -0,0 +1,38 @@ +from random import sample +from ..util import all_pairs, log + +__all__ = ['add_random_edges', 'remove_random_edges', + 'add_remove_random_edges'] + + +def add_random_edges(G, pct): + edges = G.edges() + m = len(edges) + to_add = int(m * pct) + log.logger.debug("Will add %d edges to %d (%f)" % (to_add, m, pct)) + + new_edges = set(all_pairs(G.nodes())) - set(edges) + G.add_edges_from(sample(new_edges, to_add), weight=1) + + +def remove_random_edges(G, pct): + edges = G.edges() + m = len(edges) + to_remove = int(m * pct) + + log.logger.debug("Will remove %d edges of %d (%f)" % (to_remove, m, pct)) + G.remove_edges_from(sample(edges, to_remove)) + + +def add_remove_random_edges(G, pct_add, pct_remove): + edges = G.edges() + m = len(edges) + to_add = int(m * pct_add) + to_remove = int(m * pct_remove) + log.logger.debug("Will add %d (%f) edges to and remove" + "%d (%f) edges from %d" % + (to_add, pct_add, to_remove, pct_remove, m)) + + new_edges = set(all_pairs(G.nodes())) - set(edges) + G.remove_edges_from(sample(edges, to_remove)) + G.add_edges_from(sample(new_edges, to_add)) diff --git a/linkpred/network/algorithms.py b/linkpred/network/algorithms.py new file mode 100644 index 0000000..79b2844 --- /dev/null +++ b/linkpred/network/algorithms.py @@ -0,0 +1,75 @@ +import networkx +import numpy + +from ..util import log + +__all__ = ["rooted_pagerank", "simrank"] + + +def rooted_pagerank(G, root, alpha=0.85, beta=0, weight='weight'): + """Return the rooted PageRank of all nodes with respect to node `root`. + + Parameters + ---------- + + G : a networkx.(Di)Graph + network to compute PR on + + root : a node from the network + the node that will be the starting point of all random walks + + alpha : float + PageRank probability that we will advance to a neighbour of the + current node in a random walk + + beta : float or int + Normally, we return to the root node with probability 1 - alpha. + With this parameter, we can also advance to a random other node in the + network with probability beta. Thus, we get back to the root node with + probability 1 - alpha - beta. This is off (0) by default. + + weight : string or None + The edge attribute that holds the numerical value used for + the edge weight. If None then treat as unweighted. 
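+
+    Example
+    -------
+    A minimal illustrative sketch on a small path graph (default parameters):
+
+    >>> import networkx as nx
+    >>> pr = rooted_pagerank(nx.path_graph(4), root=0)
+    >>> pr[0] > pr[3]  # nodes near the root outrank distant ones
+    True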
+ + """ + personalization = dict.fromkeys(G, beta) + personalization[root] = 1 - beta + + return networkx.pagerank_scipy(G, alpha, personalization, weight=weight) + + +def simrank(G, nodelist=None, c=0.8, num_iterations=10, weight='weight'): + r"""Calculate SimRank matrix for nodes in nodelist + + SimRank is defined as + + sim(u, v) = \frac{c}{|\Gamma(u)| |\Gamma(v)|} \sum_{p \in \Gamma(u)} + \sum_{q \in \Gamma(v)} sim(p, q) + + """ + n = len(G) + M = raw_google_matrix(G, nodelist=nodelist, weight=weight) + sim = numpy.identity(n, dtype=numpy.float32) + for i in range(num_iterations): + log.logger.debug("Starting SimRank iteration %d" % i) + temp = c * M.T * sim * M + sim = temp + numpy.identity(n) - numpy.diag(numpy.diag(temp)) + return sim + + +def raw_google_matrix(G, nodelist=None, weight='weight'): + """Calculate the raw Google matrix (stochastic without teleportation)""" + M = networkx.to_numpy_matrix(G, nodelist=nodelist, dtype=numpy.float32, + weight=weight) + n, m = M.shape # should be square + assert n == m and n > 0 + # Find 'dangling' nodes, i.e. nodes whose row's sum = 0 + dangling = numpy.where(M.sum(axis=1) == 0) + # add constant to dangling nodes' row + for d in dangling[0]: + M[d] = 1.0 / n + # Normalize. We now have the 'raw' Google matrix (cf. example on p. 11 of + # Langville & Meyer (2006)). + M = M / M.sum(axis=1) + return M diff --git a/linkpred/network/community.py b/linkpred/network/community.py new file mode 100644 index 0000000..125a797 --- /dev/null +++ b/linkpred/network/community.py @@ -0,0 +1,523 @@ +""" +This module implements community detection. +""" +__all__ = ["partition_at_level", "modularity", "best_partition", + "generate_dendogram", "induced_graph"] +__author__ = """Thomas Aynaud (thomas.aynaud@lip6.fr)""" +# Copyright (C) 2009 by +# Thomas Aynaud +# All rights reserved. +# BSD license. + +__PASS_MAX = -1 +__MIN = 0.0000001 + +import networkx as nx +import sys +import types +import array + + +def partition_at_level(dendogram, level): + """Return the partition of the nodes at the given level + + A dendogram is a tree and each level is a partition of the graph nodes. + Level 0 is the first partition, which contains the smallest communities, and the best is len(dendogram) - 1. + The higher the level is, the bigger are the communities + + Parameters + ---------- + dendogram : list of dict + a list of partitions, ie dictionnaries where keys of the i+1 are the values of the i. 
+ level : int + the level which belongs to [0..len(dendogram)-1] + + Returns + ------- + partition : dictionnary + A dictionary where keys are the nodes and the values are the set it belongs to + + Raises + ------ + KeyError + If the dendogram is not well formed or the level is too high + + See Also + -------- + best_partition which directly combines partition_at_level and generate_dendogram to obtain the partition of highest modularity + + Examples + -------- + >>> G=nx.erdos_renyi_graph(100, 0.01) + >>> dendo = generate_dendogram(G) + >>> for level in range(len(dendo) - 1) : + >>> print "partition at level", level, "is", partition_at_level(dendo, level) + """ + partition = dendogram[0].copy() + for index in range(1, level + 1): + for node, community in partition.iteritems(): + partition[node] = dendogram[index][community] + return partition + + +def modularity(partition, graph): + """Compute the modularity of a partition of a graph + + Parameters + ---------- + partition : dict + the partition of the nodes, i.e a dictionary where keys are their nodes and values the communities + graph : networkx.Graph + the networkx graph which is decomposed + + Returns + ------- + modularity : float + The modularity + + Raises + ------ + KeyError + If the partition is not a partition of all graph nodes + ValueError + If the graph has no link + TypeError + If graph is not a networkx.Graph + + References + ---------- + .. 1. Newman, M.E.J. & Girvan, M. Finding and evaluating community structure in networks. Physical Review E 69, 26113(2004). + + Examples + -------- + >>> G=nx.erdos_renyi_graph(100, 0.01) + >>> part = best_partition(G) + >>> modularity(part, G) + """ + if not isinstance(graph, nx.Graph): + raise TypeError("Bad graph type, use only non directed graph") + + inc = {} + deg = {} + links = graph.size(weight='weight') + if links == 0: + raise ValueError("A graph without link has an undefined modularity") + + for node in graph: + com = partition[node] + deg[com] = deg.get(com, 0.) + graph.degree(node, weight='weight') + for neighbor, datas in graph[node].iteritems(): + weight = datas.get("weight", 1) + if partition[neighbor] == com: + if neighbor == node: + inc[com] = inc.get(com, 0.) + float(weight) + else: + inc[com] = inc.get(com, 0.) + float(weight) / 2. + + res = 0. + for com in set(partition.values()): + res += ( + inc.get(com, 0.) / links) - (deg.get(com, 0.) / (2. * links)) ** 2 + return res + + +def best_partition(graph, partition=None): + """Compute the partition of the graph nodes which maximises the modularity + (or try..) using the Louvain heuristices + + This is the partition of highest modularity, i.e. the highest partition of the dendogram + generated by the Louvain algorithm. + + Parameters + ---------- + graph : networkx.Graph + the networkx graph which is decomposed + partition : dict, optionnal + the algorithm will start using this partition of the nodes. It's a dictionary where keys are their nodes and values the communities + + Returns + ------- + partition : dictionnary + The partition, with communities numbered from 0 to number of communities + + Raises + ------ + NetworkXError + If the graph is not Eulerian. + + See Also + -------- + generate_dendogram to obtain all the decompositions levels + + Notes + ----- + Uses Louvain algorithm + + References + ---------- + .. 1. Blondel, V.D. et al. Fast unfolding of communities in large networks. J. Stat. Mech 10008, 1-12(2008). 
+ + Examples + -------- + >>> #Basic usage + >>> G=nx.erdos_renyi_graph(100, 0.01) + >>> part = best_partition(G) + + >>> #other example to display a graph with its community : + >>> #better with karate_graph() as defined in networkx examples + >>> #erdos renyi don't have true community structure + >>> G = nx.erdos_renyi_graph(30, 0.05) + >>> #first compute the best partition + >>> partition = community.best_partition(G) + >>> #drawing + >>> size = float(len(set(partition.values()))) + >>> pos = nx.spring_layout(G) + >>> count = 0. + >>> for com in set(partition.values()) : + >>> count = count + 1. + >>> list_nodes = [nodes for nodes in partition.keys() + >>> if partition[nodes] == com] + >>> nx.draw_networkx_nodes(G, pos, list_nodes, node_size = 20, + node_color = str(count / size)) + >>> nx.draw_networkx_edges(G,pos, alpha=0.5) + >>> plt.show() + """ + dendo = generate_dendogram(graph, partition) + return partition_at_level(dendo, len(dendo) - 1) + + +def generate_dendogram(graph, part_init=None): + """Find communities in the graph and return the associated dendogram + + A dendogram is a tree and each level is a partition of the graph nodes. Level 0 is the first partition, which contains the smallest communities, and the best is len(dendogram) - 1. The higher the level is, the bigger are the communities + + + Parameters + ---------- + graph : networkx.Graph + the networkx graph which will be decomposed + part_init : dict, optionnal + the algorithm will start using this partition of the nodes. It's a dictionary where keys are their nodes and values the communities + + Returns + ------- + dendogram : list of dictionaries + a list of partitions, ie dictionnaries where keys of the i+1 are the values of the i. and where keys of the first are the nodes of graph + + Raises + ------ + TypeError + If the graph is not a networkx.Graph + + See Also + -------- + best_partition + + Notes + ----- + Uses Louvain algorithm + + References + ---------- + .. 1. Blondel, V.D. et al. Fast unfolding of communities in large networks. J. Stat. Mech 10008, 1-12(2008). 
+ + Examples + -------- + >>> G=nx.erdos_renyi_graph(100, 0.01) + >>> dendo = generate_dendogram(G) + >>> for level in range(len(dendo) - 1) : + >>> print "partition at level", level, "is", partition_at_level(dendo, level) + """ + if not isinstance(graph, nx.Graph): + raise TypeError("Bad graph type, use only non directed graph") + current_graph = graph.copy() + status = Status() + status.init(current_graph, part_init) + mod = __modularity(status) + status_list = list() + __one_level(current_graph, status) + new_mod = __modularity(status) + partition = __renumber(status.node2com) + status_list.append(partition) + mod = new_mod + current_graph = induced_graph(partition, current_graph) + status.init(current_graph) + + while True: + __one_level(current_graph, status) + new_mod = __modularity(status) + if new_mod - mod < __MIN: + break + partition = __renumber(status.node2com) + status_list.append(partition) + mod = new_mod + current_graph = induced_graph(partition, current_graph) + status.init(current_graph) + return status_list[:] + + +def induced_graph(partition, graph): + """Produce the graph where nodes are the communities + + there is a link of weight w between communities if the sum of the weights of the links between their elements is w + + Parameters + ---------- + partition : dict + a dictionary where keys are graph nodes and values the part the node belongs to + graph : networkx.Graph + the initial graph + + Returns + ------- + g : networkx.Graph + a networkx graph where nodes are the parts + + Examples + -------- + >>> n = 5 + >>> g = nx.complete_graph(2*n) + >>> part = {} + >>> for node in g.nodes() : + >>> part[node] = node % 2 + >>> ind = induced_graph(part, g) + >>> goal = nx.Graph() + >>> goal.add_weighted_edges_from([(0,1,n*n),(0,0,n*(n-1)/2), (1, 1, n*(n-1)/2)]) + >>> nx.is_isomorphic(int, goal) + True + """ + ret = nx.Graph() + ret.add_nodes_from(partition.values()) + + for node1, node2, datas in graph.edges_iter(data=True): + weight = datas.get("weight", 1) + com1 = partition[node1] + com2 = partition[node2] + w_prec = ret.get_edge_data(com1, com2, {"weight": 0}).get("weight", 1) + ret.add_edge(com1, com2, weight=w_prec + weight) + + return ret + + +def __renumber(dictionary): + """Renumber the values of the dictionary from 0 to n + """ + count = 0 + ret = dictionary.copy() + new_values = {} + + for key in dictionary.keys(): + value = dictionary[key] + new_value = new_values.get(value, -1) + if new_value == -1: + new_values[value] = count + new_value = count + count = count + 1 + ret[key] = new_value + + return ret + + +def __load_binary(data): + """Load binary graph as used by the cpp implementation of this algorithm + """ + if isinstance(data, types.StringType): + data = open(data, "rb") + + reader = array.array("I") + reader.fromfile(data, 1) + num_nodes = reader.pop() + reader = array.array("I") + reader.fromfile(data, num_nodes) + cum_deg = reader.tolist() + num_links = reader.pop() + reader = array.array("I") + reader.fromfile(data, num_links) + links = reader.tolist() + graph = nx.Graph() + graph.add_nodes_from(range(num_nodes)) + prec_deg = 0 + + for index in range(num_nodes): + last_deg = cum_deg[index] + neighbors = links[prec_deg:last_deg] + graph.add_edges_from([(index, int(neigh)) for neigh in neighbors]) + prec_deg = last_deg + + return graph + + +def __one_level(graph, status): + """Compute one level of communities + """ + modif = True + nb_pass_done = 0 + cur_mod = __modularity(status) + new_mod = cur_mod + + while modif and nb_pass_done != __PASS_MAX: + 
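+        # One "pass" over all nodes: each node is tentatively removed from its
+        # current community and re-inserted into the neighbouring community
+        # with the largest modularity gain (possibly staying where it was).
+        # Passes repeat as long as at least one node moved and the overall
+        # modularity improvement is at least __MIN.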
cur_mod = new_mod + modif = False + nb_pass_done += 1 + + for node in graph.nodes(): + com_node = status.node2com[node] + degc_totw = status.gdegrees.get( + node, 0.) / (status.total_weight * 2.) + neigh_communities = __neighcom(node, graph, status) + __remove(node, com_node, + neigh_communities.get(com_node, 0.), status) + best_com = com_node + best_increase = 0 + for com, dnc in neigh_communities.iteritems(): + incr = dnc - status.degrees.get(com, 0.) * degc_totw + if incr > best_increase: + best_increase = incr + best_com = com + __insert(node, best_com, + neigh_communities.get(best_com, 0.), status) + if best_com != com_node: + modif = True + new_mod = __modularity(status) + if new_mod - cur_mod < __MIN: + break + + +class Status: + """ + To handle several data in one struct. + + Could be replaced by named tuple, but don't want to depend on python 2.6 + """ + node2com = {} + total_weight = 0 + internals = {} + degrees = {} + gdegrees = {} + + def __init__(self): + self.node2com = {} + self.total_weight = 0 + self.degrees = {} + self.gdegrees = {} + self.internals = {} + self.loops = {} + + def __str__(self): + return ("node2com : " + str(self.node2com) + " degrees : " + + str(self.degrees) + " internals : " + str(self.internals) + + " total_weight : " + str(self.total_weight)) + + def copy(self): + """Perform a deep copy of status""" + new_status = Status() + new_status.node2com = self.node2com.copy() + new_status.internals = self.internals.copy() + new_status.degrees = self.degrees.copy() + new_status.gdegrees = self.gdegrees.copy() + new_status.total_weight = self.total_weight + + def init(self, graph, part=None): + """Initialize the status of a graph with every node in one community""" + count = 0 + self.node2com = {} + self.total_weight = 0 + self.degrees = {} + self.gdegrees = {} + self.internals = {} + self.total_weight = graph.size(weight='weight') + if part is None: + for node in graph.nodes(): + self.node2com[node] = count + deg = float(graph.degree(node, weight='weight')) + self.degrees[count] = deg + self.gdegrees[node] = deg + self.loops[node] = float(graph.get_edge_data(node, node, + {"weight": 0}).get("weight", 1)) + self.internals[count] = self.loops[node] + count = count + 1 + else: + for node in graph.nodes(): + com = part[node] + self.node2com[node] = com + deg = float(graph.degree(node, weigh='weight')) + self.degrees[com] = self.degrees.get(com, 0) + deg + self.gdegrees[node] = deg + inc = 0. + for neighbor, datas in graph[node].iteritems(): + weight = datas.get("weight", 1) + if part[neighbor] == com: + if neighbor == node: + inc += float(weight) + else: + inc += float(weight) / 2. + self.internals[com] = self.internals.get(com, 0) + inc + + +def __neighcom(node, graph, status): + """ + Compute the communities in the neighborood of node in the graph given + with the decomposition node2com + """ + weights = {} + for neighbor, datas in graph[node].iteritems(): + if neighbor != node: + weight = datas.get("weight", 1) + neighborcom = status.node2com[neighbor] + weights[neighborcom] = weights.get(neighborcom, 0) + weight + + return weights + + +def __remove(node, com, weight, status): + """ Remove node from community com and modify status""" + status.degrees[com] = (status.degrees.get(com, 0.) + - status.gdegrees.get(node, 0.)) + status.internals[com] = float(status.internals.get(com, 0.) 
- + weight - status.loops.get(node, 0.)) + status.node2com[node] = -1 + + +def __insert(node, com, weight, status): + """ Insert node into community and modify status""" + status.node2com[node] = com + status.degrees[com] = (status.degrees.get(com, 0.) + + status.gdegrees.get(node, 0.)) + status.internals[com] = float(status.internals.get(com, 0.) + + weight + status.loops.get(node, 0.)) + + +def __modularity(status): + """ + Compute the modularity of the partition of the graph faslty using status precomputed + """ + links = float(status.total_weight) + result = 0. + for community in set(status.node2com.values()): + in_degree = status.internals.get(community, 0.) + degree = status.degrees.get(community, 0.) + if links > 0: + result = result + in_degree / links - ((degree / ( + 2. * links)) ** 2) + return result + + +def __main(): + """Main function to mimic C++ version behavior""" + try: + filename = sys.argv[1] + graphfile = __load_binary(filename) + partition = best_partition(graphfile) + print >> sys.stderr, str(modularity(partition, graphfile)) + for elem, part in partition.iteritems(): + print str(elem) + " " + str(part) + except (IndexError, IOError): + print "Usage : ./community filename" + print "find the communities in graph filename and display the dendogram" + print "Parameters:" + print "filename is a binary file as generated by the " + print "convert utility distributed with the C implementation" + + +if __name__ == "__main__": + __main() diff --git a/linkpred/network/misc.py b/linkpred/network/misc.py new file mode 100644 index 0000000..9994fb7 --- /dev/null +++ b/linkpred/network/misc.py @@ -0,0 +1,73 @@ +import networkx as nx + +#TODO Examine if we can use nx.single-source_shortest_path_length here + + +def neighbourhood_search(G, n, k=1): + """Get k-neighbourhood of node n""" + dist = {} + dist[n] = 0 + queue = [n] + while queue: + v = queue.pop(0) + if dist[v] == k: + break + for w in G[v]: + if w not in dist: + queue.append(w) + dist[w] = dist[v] + 1 + return dist + + +def neighbourhood_graph(G, n, k=1): + """Get k-neighbourhood subgraph of node n""" + dist = neighbourhood_search(G, n, k) + return G.subgraph(dist.keys()) + + +def edge_weights(G, weight='weight'): + """Iterator over edge weights in G""" + for u, nbrdict in G.adjacency_iter(): + for edgedata in nbrdict.itervalues(): + yield edgedata[weight] + + +def from_biadjacency_matrix(A, row_items=None, col_items=None, weight='weight'): + import numpy + + kind_to_python_type = {'f': float, + 'i': int, + 'u': int, + 'b': bool, + 'c': complex, + 'S': str} + + dt = A.dtype + nrows, ncols = A.shape + try: + python_type = kind_to_python_type[dt.kind] + except: + raise TypeError("Unknown numpy data type: %s" % dt) + + if row_items is None: + row_items = range(nrows) + elif len(row_items) != nrows: + raise ValueError("Expected %d row items, but got %d instead" % + (nrows, len(row_items))) + if col_items is None: + col_items = range(nrows, nrows + ncols) + elif len(col_items) != ncols: + raise ValueError("Expected %d col items, but got %d instead" % + (ncols, len(col_items))) + + G = nx.Graph() + G.add_nodes_from(row_items) + G.add_nodes_from(col_items) + # get a list of edges + x, y = numpy.asarray(A).nonzero() + + # handle numpy constructed data type + G.add_edges_from((row_items[u], col_items[v], {weight: python_type(A[u, v])}) + for u, v in zip(x, y)) + + return G diff --git a/linkpred/network/pajek.py b/linkpred/network/pajek.py new file mode 100644 index 0000000..b309c65 --- /dev/null +++ b/linkpred/network/pajek.py @@ 
-0,0 +1,202 @@ +# Fork of networkx.readwrite.pajek +import csv +import networkx +from networkx.utils import is_string_like + +__all__ = ['read_pajek', 'parse_pajek', 'write_pajek'] + + +def write_pajek(G, path, weight='weight', clusterpath=None, clusterlabel='cluster'): + """Write in Pajek format to path. + + Parameters + ---------- + G : graph + A networkx graph + path : file or string + File or filename to write. + Filenames ending in .gz or .bz2 will be compressed. + weight : string + Edge attribute for edge weight + clusterpath : file or string + Optional path of partition file + clusterlabel : string + Label of the partition. Default: 'cluster' + + Examples + -------- + >>> G=nx.path_graph(4) + >>> nx.write_pajek(G, "test.net") + """ + + with open(path, mode="w") as fh: + if G.name: + fh.write("*network \"%s\"\n" % G.name) + + # write nodes with attributes + fh.write("*vertices %s\n" % G.order()) + clu = "*vertices %s\n" % G.order() + nodes = G.nodes() + # make dictionary mapping nodes to integers + nodenumber = dict(zip(nodes, range(1, len(nodes) + 1))) + clusters = {} + i = 0 + for n in nodes: + na = G.node[n].copy() + x = na.pop('x', None) + y = na.pop('y', None) + # It seems better if we just avoid the node_id in the dict altogether... + node_id = nodenumber[n] + shape = na.pop('shape', None) + fh.write("%d \"%s\" %f %f %s " % (node_id, n, + float(x), float(y), shape)) + fh.write("%d \"%s\" " % (node_id, n)) + for attr in (x, y): + if attr is not None: + fh.write("%f " % float(x)) + if shape: + fh.write("%s " % shape) + for k, v in na.iteritems(): + fh.write("%s \"%s\" " % (k, v)) + fh.write("\n") + + if clusterpath: + if G.node[n][clusterlabel] not in clusters: + i += 1 + clusters[G.node[n][clusterlabel]] = i + clu += "%d\n" % clusters[G.node[n][clusterlabel]] + + # write edges with attributes + if G.is_directed(): + fh.write("*arcs\n") + else: + fh.write("*edges\n") + for u, v, edgedata in G.edges(data=True): + d = edgedata.copy() + value = d.pop(weight, 1.0) # use 1 as default edge value + fh.write("%d %d %f" % (nodenumber[u], nodenumber[v], float(value))) + for k, v in d.iteritems(): + if is_string_like(v) and " " in v: + # add quotes to any values with a blank space + v = "\"%s\"" % v + fh.write("%s %s " % (k, v)) + fh.write("\n") + fh.close() + + if clusterpath: + with open(clusterpath, mode="w") as fh: + clusterpath.write(clu) + + +def read_pajek(path, weight='weight'): + """Read graph in Pajek format from path. + + Returns a MultiGraph or MultiDiGraph. + + Parameters + ---------- + path : file or string + File or filename to write. + Filenames ending in .gz or .bz2 will be compressed. + weight : string + Edge attribute for edge weight + + Examples + -------- + >>> G=nx.path_graph(4) + >>> nx.write_pajek(G, "test.net") + >>> G=nx.read_pajek("test.net") + + To create a Graph instead of a MultiGraph use + + >>> G1=nx.Graph(G) + + """ + with open(path) as fh: + G = parse_pajek(fh, weight=weight) + return G + + +def parse_line(l): + # XXX This is not ideal: we instantiate a new object for each line... + return csv.reader(l, delimiter=' ', skipinitialspace=True).next() + + +def parse_pajek(lines, weight='weight'): + """Parse pajek format graph from string or iterable. + + Primarily used as a helper for read_pajek(). 
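A minimal sketch of driving parse_pajek with an in-memory list of Pajek lines (the three-node network below is made up, and the snippet assumes the Python 2 / networkx 1.x environment the rest of this module targets):

    from linkpred.network.pajek import parse_pajek

    pajek_lines = [
        '*vertices 3',
        '1 "A"',
        '2 "B"',
        '3 "C"',
        '*edges',
        '1 2 2.0',
        '2 3 1.0',
    ]
    G = parse_pajek(pajek_lines)
    print(sorted(G.nodes()))         # ['A', 'B', 'C']
    print(G['A']['B'][0]['weight'])  # 2.0 (a MultiGraph, hence the [0] edge key)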
+ + See Also + -------- + read_pajek() + + """ + G = networkx.MultiDiGraph() + nodelabels = {} + nnodes = 0 + for l in lines: + if not l.split(): # Ignore empty lines + pass + elif l.startswith("*"): + if l.lower().startswith("*network"): + try: + G.name = l.split()[1] + except: + pass + elif l.lower().startswith("*vertices"): + state = "vertices" + nnodes = int(l.split()[1]) + elif l.lower().startswith("*edges"): + state = "edges" + elif l.lower().startswith("*arcs"): + state = "arcs" + elif l.lower().startswith("*matrix"): + raise NotImplementedError( + "Pajek matrix format is not yet supported.") + elif state == "vertices": + splitline = parse_line([l]) + node_id, label = splitline[0:2] + if label in G.adj: + raise networkx.NetworkXException( + "Node already added: " + label) + G.add_node(label) + nodelabels[node_id] = label + G.node[label] = {'node_id': node_id} + try: + x = float(splitline[2]) + y = float(splitline[3]) + try: + z = float(splitline[4]) + shape = splitline.pop(5) + G.node[label].update({'z': z}) + except ValueError: + shape = splitline[4] + G.node[label].update({'x': x, 'y': y, 'shape': shape}) + extra_attr = zip(splitline[5::2], splitline[6::2]) + except (ValueError, IndexError): + extra_attr = zip(splitline[2::2], splitline[3::2]) + G.node[label].update(extra_attr) + elif state == "edges" or state == "arcs": + if G.is_directed() and state == "edges": + # The Pajek format supports networks with both directed and + # edges. Since networkx does not, make this an undirected + # network as soon as we encounter one undirected edge. + G = networkx.MultiGraph(G) + splitline = l.split() + ui, vi = splitline[0:2] + u = nodelabels.get(ui, ui) + v = nodelabels.get(vi, vi) + edge_data = {} + try: + w = float(splitline[2]) + edge_data.update({weight: w}) + extra_attr = zip(splitline[3::2], splitline[4::2]) + except (ValueError, IndexError): + extra_attr = zip(splitline[2::2], splitline[3::2]) + edge_data.update(extra_attr) + G.add_edge(u, v, **edge_data) + if nnodes != len(G): + raise networkx.NetworkXException( + "Wrong number of nodes in Pajek stream!") + return G diff --git a/linkpred/network/tests/test_misc.py b/linkpred/network/tests/test_misc.py new file mode 100644 index 0000000..9935783 --- /dev/null +++ b/linkpred/network/tests/test_misc.py @@ -0,0 +1,27 @@ +import networkx as nx +from linkpred.network.misc import from_biadjacency_matrix +from nose.tools import * + +class TestMisc: + + def setup(self): + self.G = nx.bipartite_gnmk_random_graph(40, 60, 50) + self.M = nx.bipartite.biadjacency_matrix(self.G, range(40)) + + def test_biadjacency_matrix1(self): + H = from_biadjacency_matrix(self.M, range(40), range(40, 100)) + assert_equal(sorted(self.G.edges()), sorted(H.edges())) + assert_equal(sorted(self.G.nodes()), sorted(H.nodes())) + + def test_biadjacency_matrix2(self): + H = from_biadjacency_matrix(self.M) + assert_equal(sorted(self.G.edges()), sorted(H.edges())) + assert_equal(sorted(self.G.nodes()), sorted(H.nodes())) + + @raises(ValueError) + def test_biadjacency_matrix_wrong_row_items(self): + from_biadjacency_matrix(self.M, range(41), range(41, 101)) + + @raises(ValueError) + def test_biadjacency_matrix_wrong_col_items(self): + from_biadjacency_matrix(self.M, range(40), range(40, 101)) diff --git a/linkpred/predictors/__init__.py b/linkpred/predictors/__init__.py new file mode 100644 index 0000000..1c40d8b --- /dev/null +++ b/linkpred/predictors/__init__.py @@ -0,0 +1,5 @@ +from .base import * +from .eigenvector import * +from .misc import * +from .neighbour import 
* +from .path import * diff --git a/linkpred/predictors/base.py b/linkpred/predictors/base.py new file mode 100644 index 0000000..05c5715 --- /dev/null +++ b/linkpred/predictors/base.py @@ -0,0 +1,137 @@ +from .util import neighbourhood + +__all__ = ["Predictor", + "all_predictors"] + + +class Predictor(object): + """ + Predictor based on graph structure + + This can also be used for bipartite networks or other networks + involving nodes that should not be included in the predictions. + To distinguish between 'eligible' and 'non-eligible' nodes, the + graph can set a node attribute that returns true for eligible + nodes and false for non-eligible ones. + + For instance: + + >>> B = nx.Graph() + >>> B.add_nodes_from([1,2,3,4], bipartite=0) # Add the node attribute "bipartite" + >>> B.add_nodes_from(['a','b','c'], bipartite=1) + >>> B.add_edges_from([(1,'a'), (1,'b'), (2,'b'), (2,'c'), (3,'c'), (4,'a')]) + >>> p = Predictor(B, eligible='bipartite') + >>> p.eligible_node(1) + 0 + >>> sorted(p.eligible_nodes()) + ['a', 'b', 'c'] + + """ + def __init__(self, G, eligible=None, only_new=False): + """ + Initialize predictor + + Arguments + --------- + G : nx.Graph + a graph + + eligible : a string or None + If this is a string, it is used to distinguish between eligible + and non-eligible nodes. We only try to predict links between + two eligible nodes. + + only_new : True|False + If True, this ensures that we only predict 'new' links that are not + yet present in G. Otherwise, we predict all links, regardless of whether + or not they are in G. + + """ + self.G = G + self.eligible_attr = eligible + self.only_new = only_new + + # Add a decorator to predict(), to do the necessary postprocessing for + # filtering out new links if only_new is False. We do this in __init__() such + # that child classes need not be changed. 
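A hedged sketch of what the wrapper defined below buys us, mirroring the behaviour exercised by test_postprocessing in the tests further down: with only_new=True, no predicted pair should coincide with an edge that already exists in the graph.

    import networkx as nx
    from linkpred.predictors.neighbour import CommonNeighbours

    G = nx.karate_club_graph()
    prediction = CommonNeighbours(G, only_new=True)()
    for link in prediction:           # Scoresheet behaves like a dict keyed by node pairs
        assert not G.has_edge(*link)  # existing edges have been filtered out by the wrapper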
+ def add_postprocessing(func): + def predict_and_postprocess(*args, **kwargs): + scoresheet = func(*args, **kwargs) + if self.only_new: + for u, v in self.G.edges_iter(): + try: + del scoresheet[(u, v)] + except KeyError: + pass + return scoresheet + predict_and_postprocess.__name__ = func.__name__ + predict_and_postprocess.__doc__ = func.__doc__ + predict_and_postprocess.__dict__.update(func.__dict__) + return predict_and_postprocess + + self.predict = add_postprocessing(self.predict) + + def __str__(self): + if not self.name: + self.name = self.__class__.__name__ + return self.name + + def __call__(self, *args, **kwargs): + return self.predict(*args, **kwargs) + + def predict(self, *args, **kwargs): + raise NotImplementedError + + @classmethod + def arguments(cls): + import inspect + + eligible = lambda x: isinstance(x, (int, float, bool)) or x == 'weight' + a = inspect.getargspec(cls.predict) + if a.defaults: + args = {k: v for k, v in zip(a.args[1:], a.defaults) if eligible(v)} + else: + args = {} + + return args + + def eligible(self, u, v): + return self.eligible_node(u) and self.eligible_node(v) and u != v + + def eligible_node(self, v): + if self.eligible_attr is None: + return True + return self.G.node[v][self.eligible_attr] + + def eligible_nodes(self): + return [v for v in self.G if self.eligible_node(v)] + + def likely_pairs(self, k=2): + """ + Yield node pairs from the same neighbourhood + + Arguments + --------- + k : int + size of the neighbourhood (e.g., if k = 2, the neighbourhood + consists of all nodes that are two links away) + + """ + for a in self.G.nodes_iter(): + if not self.eligible_node(a): + continue + for b in neighbourhood(self.G, a, k): + if not self.eligible_node(b): + continue + yield (a, b) + + +def all_predictors(): + """ + Returns a list of all subclasses of `Predictor` + """ + from linkpred.util import itersubclasses + from operator import itemgetter + + predictors = sorted([(s, s.__name__) for s in itersubclasses(Predictor)], key=itemgetter(1)) + return zip(*predictors)[0] diff --git a/linkpred/predictors/eigenvector.py b/linkpred/predictors/eigenvector.py new file mode 100644 index 0000000..acbcdc2 --- /dev/null +++ b/linkpred/predictors/eigenvector.py @@ -0,0 +1,80 @@ +from ..evaluation import Scoresheet +from ..network import neighbourhood_graph, rooted_pagerank, simrank +from ..util import progressbar +from .base import Predictor + + +class RootedPageRank(Predictor): + def predict(self, nbunch=None, alpha=0.85, beta=0, weight='weight', k=None): + """Predict using rooted PageRank. + + Parameters + ---------- + + G : a networkx.Graph + + nbunch : iterable collection of nodes + node(s) to calculate PR for (default: all) + + alpha : float + PageRank probability that we will advance to a neighbour of the + current node in a random walk + + + beta : float or int + Normally, we return to the root node with probability 1 - alpha. + With this parameter, we can also advance to a random other node in the + network with probability beta. Thus, we get back to the root node with + probability 1 - alpha - beta. This is off (0) by default. + + weight : string or None + The edge attribute that holds the numerical value used for + the edge weight. If None then treat as unweighted. + + k : int or None + If `k` is `None`, this predictor is applied to the entire network. + If `k` is an int, the predictor is applied to a subgraph consisting + of the k-neighbourhood of the current node. + Results are often very similar but much faster. 
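A hedged usage sketch of the k parameter (the karate club graph is only a stand-in, and exact scores depend on linkpred.network.rooted_pagerank): restricting the random walk to each node's 2-neighbourhood typically yields rankings close to the full computation at much lower cost.

    import networkx as nx
    from linkpred.predictors.eigenvector import RootedPageRank

    G = nx.karate_club_graph()
    full = RootedPageRank(G)()      # rooted PageRank over the whole graph
    local = RootedPageRank(G)(k=2)  # restricted to each node's 2-neighbourhood subgraph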
+ + See documentation for linkpred.network.rooted_pagerank() for these + parameters. + + """ + res = Scoresheet() + if nbunch is None: + nbunch = self.G.nodes() + for u in progressbar(nbunch): + if not self.eligible_node(u): + continue + if k is None: + G = self.G + else: + # Restrict to the k-neighbourhood subgraph + G = neighbourhood_graph(self.G, u, k) + pagerank_scores = rooted_pagerank(G, u, alpha, beta, weight) + for v, w in pagerank_scores.iteritems(): + if w > 0 and u != v and self.eligible_node(v): + res[(u, v)] += w + return res + + +class SimRank(Predictor): + def predict(self, c=0.8, num_iterations=10, weight='weight'): + res = Scoresheet() + nodelist = self.G.nodes() + sim = simrank(self.G, nodelist, c, num_iterations, weight) + (m, n) = sim.shape + assert m == n + + for i in range(m): + # sim(a, b) = sim(b, a), leading to a 'mirrored' matrix. + # We start the column range at i + 1, such that we only look at the + # upper triangle in the matrix, excluding the diagonal: sim(a, a) = 1. + u = nodelist[i] + for j in range(i + 1, n): + if sim[i, j] > 0: + v = nodelist[j] + if self.eligible(u, v): + res[(u, v)] = sim[i, j] + return res diff --git a/linkpred/predictors/misc.py b/linkpred/predictors/misc.py new file mode 100644 index 0000000..45abc98 --- /dev/null +++ b/linkpred/predictors/misc.py @@ -0,0 +1,47 @@ +from ..evaluation import Scoresheet +from ..util import all_pairs +from .base import Predictor + +__all__ = ["Community", + "Copy", + "Random"] + + +class Community(Predictor): + def predict(self): + from collections import defaultdict + from linkpred.network import generate_dendogram, partition_at_level + + res = Scoresheet() + dendogram = generate_dendogram(self.G) + + for i in range(len(dendogram)): + partition = partition_at_level(dendogram, i) + communities = defaultdict(list) + weight = len(dendogram) - i # Lower i, smaller communities + + for n, com in partition.iteritems(): + communities[com].append(n) + for nodes in communities.itervalues(): + for u, v in all_pairs(nodes): + if not self.eligible(u, v): + continue + res[(u, v)] += weight + return res + + +class Copy(Predictor): + def predict(self, weight=None): + if weight is None: + return Scoresheet.fromkeys(self.G.edges_iter(), 1) + return Scoresheet(((u, v), d[weight]) for u, v, d in self.G.edges(data=True)) + + +class Random(Predictor): + def predict(self): + import random + + res = Scoresheet() + for a, b in all_pairs(self.eligible_nodes()): + res[(a, b)] = random.random() + return res diff --git a/linkpred/predictors/neighbour.py b/linkpred/predictors/neighbour.py new file mode 100644 index 0000000..06d11a1 --- /dev/null +++ b/linkpred/predictors/neighbour.py @@ -0,0 +1,348 @@ +import math + +from ..evaluation import Scoresheet +from ..util import all_pairs +from .base import Predictor +from .util import neighbourhood, neighbourhood_size,\ + neighbourhood_intersection_size, neighbourhood_union_size + +__all__ = ["AdamicAdar", + "AssociationStrength", + "CommonNeighbours", + "CommonKNeighbours", + "Cosine", + "DegreeProduct", + "Euclidean", + "HirschCore", + "Jaccard", + "K50", + "Manhattan", + "Minkowski", + "MaxOverlap", + "MinOverlap", + "NMeasure", + "Pearson", + "ResourceAllocation"] + + +class AdamicAdar(Predictor): + def predict(self, weight=None): + res = Scoresheet() + for a, b in self.likely_pairs(): + intersection = set(neighbourhood(self.G, a)) & \ + set(neighbourhood(self.G, b)) + w = 0 + for c in intersection: + if weight is not None: + numerator = self.G[a][c][weight] * self.G[b][c][weight] + 
else: + numerator = 1.0 + w += numerator / \ + math.log(neighbourhood_size(self.G, c, weight)) + if w > 0: + res[(a, b)] = w + return res + + +class AssociationStrength(Predictor): + def predict(self, weight=None): + res = Scoresheet() + for a, b in self.likely_pairs(): + w = neighbourhood_intersection_size(self.G, a, b, weight) / \ + float(neighbourhood_size(self.G, a, weight) * + neighbourhood_size(self.G, b, weight)) + if w > 0: + res[(a, b)] = w + return res + + +class CommonNeighbours(Predictor): + def predict(self, alpha=1.0, weight=None): + r"""Predict using common neighbours + + This is loosely based on Opsahl et al. (2010): + + k(u, v) = |N(u) \cap N(v)| + s(u, v) = \sum_{i=1}^n x_i \cdot y_i + w(u, v) = k(u, v)^{1 - \alpha} \cdot s(u, v)^{\alpha} + + """ + res = Scoresheet() + for a, b in self.likely_pairs(): + if weight is None or alpha == 0.0: + w = neighbourhood_intersection_size(self.G, a, b, weight=None) + elif alpha == 1.0: + w = neighbourhood_intersection_size( + self.G, a, b, weight=weight) + else: + k = neighbourhood_intersection_size(self.G, a, b, weight=None) + s = neighbourhood_intersection_size( + self.G, a, b, weight=weight) + w = (k ** (1.0 - alpha)) * (s ** alpha) + if w > 0: + res[(a, b)] = w + return res + + +class CommonKNeighbours(Predictor): + def predict(self, beta=0.01, max_k=3, weight=None): + r"""A generalized version of common neighbours, somewhat inspired by Katz + + $w(u, v) = \sum_{k=1}^\infty \beta^k |\self.Gamma_k(u) \cap \self.Gamma_k(v)|$ + + """ + res = Scoresheet() + #for a, b in all_pairs(self.G.nodes()): + for a, b in self.likely_pairs(): + w = 0 + for k in range(1, max_k + 1): + w += (beta ** k) *\ + neighbourhood_intersection_size(self.G, a, b, weight, k) + if w > 0: + res[(a, b)] = w + return res + + +class Cosine(Predictor): + def predict(self, weight=None): + res = Scoresheet() + for a, b in self.likely_pairs(): + w = neighbourhood_intersection_size(self.G, a, b, weight) / \ + math.sqrt(neighbourhood_size(self.G, a, weight) * + neighbourhood_size(self.G, b, weight)) + if w > 0: + res[(a, b)] = w + return res + + +class DegreeProduct(Predictor): + def predict(self, weight=None, minimum=1): + res = Scoresheet() + for a, b in all_pairs(self.eligible_nodes()): + w = neighbourhood_size(self.G, a, weight) *\ + neighbourhood_size(self.G, b, weight) + if w >= minimum: + res[(a, b)] = w + return res + + +class Minkowski(Predictor): + r""" + Predictor based on Minkowski distance + + The distance `d` is defined as: + + .. math:: + + d = ( \sum |x_i - x_j|^r )^{1/r} + + and hence the likelihood score `w` is: + + .. 
math:: + + w = \frac{1}{d} + + """ + def predict(self, r=1, weight='weight'): + + def size(G, u, v, weight=None): + if weight is None and G.has_edge(u, v): + return 1 + try: + return G[u][v][weight] + except KeyError: + return 0 + + res = Scoresheet() + for a, b in self.likely_pairs(): + nbr_a = set(neighbourhood(self.G, a)) + nbr_b = set(neighbourhood(self.G, b)) + d = sum(abs(size(self.G, a, v, weight) - size(self.G, b, v, weight)) ** r + for v in nbr_a & nbr_b) + d += sum(size(self.G, a, v, weight) ** r for v in nbr_a - nbr_b) + d += sum(size(self.G, b, v, weight) ** r for v in nbr_b - nbr_a) + d = d ** 1.0 / r + if d > 0: + # d is a distance measure, so we take the inverse + res[(a, b)] = 1.0 / d + return res + + +class Euclidean(Minkowski): + def predict(self, weight='weight'): + return Minkowski.predict(self, r=2, weight=weight) + + +class HirschCore(Predictor): + """ + Predictor based on overlap between the h-cores of nodes + + The h-index of a node n is the largest number h, such that each node has at + least h neighbours. + The h-core of a node n is then defined as the set of neighbours of n with h + or more neighbours. + + References + ---------- + Schubert, A. (2010). A reference-based Hirschian similarity measure for + journals. Scientometrics 84(1), 133-147. + + Schubert, A. & Soos, S. (2010). Mapping of science journals based on + h-similarity. Scientometrics 83(2), 589-600. + + """ + def predict(self): + + def h_core_set(G, nodes): + from hirsch import h_index + + degree_dict = {n: len(G[n]) for n in nodes} + h_degree = h_index(degree_dict.values()) + + return set(k for k, v in degree_dict.iteritems() if v >= h_degree) + + res = Scoresheet() + for a, b in self.likely_pairs(): + a_neighbours = set(neighbourhood(self.G, a)) + b_neighbours = set(neighbourhood(self.G, b)) + if a_neighbours & b_neighbours: + a_core = h_core_set(self.G, a_neighbours) + b_core = h_core_set(self.G, b_neighbours) + if a_core & b_core: + # Jaccard index of Hirsch cores or peripheries + res[(a, b)] = len( + a_core & b_core) / float(len(a_core | b_core)) + return res + + +class Jaccard(Predictor): + def predict(self, weight=None): + """Predict by Jaccard index, based on neighbours of a and b + + Jaccard index J = |A \cap B| / |A \cup B| + + """ + res = Scoresheet() + for a, b in self.likely_pairs(): + # Best performance: weighted numerator, unweighted denominator. 
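+            # Worked toy example (hypothetical numbers): if a and b share 2 neighbours
+            # and have 5 distinct neighbours in total, J = 2 / 5 = 0.4.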
+ numerator = neighbourhood_intersection_size(self.G, a, b, weight) + denominator = neighbourhood_union_size(self.G, a, b, weight) + w = numerator / float(denominator) + if w > 0: + res[(a, b)] = w + return res + + +class K50(Predictor): + def predict(self, weight=None): + """K50, proposed by Boyack & Klavans (2006)""" + res = Scoresheet() + nbr_all = sum(neighbourhood_size(self.G, x, weight) + for x in self.G.nodes_iter()) + for a, b in self.likely_pairs(): + intersection = neighbourhood_intersection_size( + self.G, a, b, weight) + nbr_a = neighbourhood_size(self.G, a, weight) + nbr_b = neighbourhood_size(self.G, b, weight) + den = nbr_a * nbr_b + expected = min( + den / float(nbr_all - nbr_a), den / float(nbr_all - nbr_b)) + w = (intersection - expected) / math.sqrt(den) + if w > 0: + res[(a, b)] = w + return res + + +class Manhattan(Minkowski): + def predict(self, weight='weight'): + return Minkowski.predict(self, r=1, weight=weight) + + +class NMeasure(Predictor): + def predict(self, weight=None): + r"""Predict by N measure (Egghe, 2009) + + $N(A, B) = \srqt{2} \frac{|A \cap B|}{\sqrt{|A|^2 + |B|^2}}$ + + """ + res = Scoresheet() + for a, b in self.likely_pairs(): + w = math.sqrt(2) *\ + neighbourhood_intersection_size(self.G, a, b, weight) / \ + math.sqrt(neighbourhood_size(self.G, a, weight) ** 2 + + neighbourhood_size(self.G, b, weight) ** 2) + if w > 0: + res[(a, b)] = w + return res + + +class Overlap(Predictor): + def predict(self, function, weight=None): + res = Scoresheet() + for a, b in self.likely_pairs(): + # Best performance: weighted numerator, unweighted denominator. + numerator = neighbourhood_intersection_size(self.G, a, b, weight) + denominator = function(neighbourhood_size(self.G, a, weight), + neighbourhood_size(self.G, b, weight)) + w = numerator / float(denominator) + if w > 0: + res[(a, b)] = w + return res + + +class MaxOverlap(Overlap): + def predict(self, weight=None): + return Overlap.predict(self, max, weight) + + +class MinOverlap(Overlap): + def predict(self, weight=None): + return Overlap.predict(self, min, weight) + + +class Pearson(Predictor): + def predict(self, weight=None): + res = Scoresheet() + # 'Full' Pearson looks at all possible pairs. Since those are likely + # of little value for link prediction, we restrict ourselves to pairs + # with at least one common neighbour. + for a, b in self.likely_pairs(): + n = len(self.G) - 1 + a_l2norm = neighbourhood_size(self.G, a, weight) + b_l2norm = neighbourhood_size(self.G, b, weight) + a_l1norm = neighbourhood_size(self.G, a, weight, pow=1) + b_l1norm = neighbourhood_size(self.G, b, weight, pow=1) + intersect = neighbourhood_intersection_size(self.G, a, b, weight) + + numerator = (n * intersect) - (a_l1norm * b_l1norm) + denominator = math.sqrt(n * a_l2norm - a_l1norm ** 2) * \ + math.sqrt(n * b_l2norm - b_l1norm ** 2) + + w = numerator / denominator + if w > 0: + res[(a, b)] = w + return res + + +class ResourceAllocation(Predictor): + def predict(self, weight=None): + """Predict with Resource Allocation index + + See T. Zhou, L. Lu, YC. Zhang (2009). Eur. Phys. J. 
B, 71, 623 + + """ + res = Scoresheet() + for a, b in self.likely_pairs(): + intersection = set(neighbourhood(self.G, a)) & \ + set(neighbourhood(self.G, b)) + w = 0 + for c in intersection: + if weight is not None: + numerator = float(self.G[a][c][weight] * + self.G[b][c][weight]) + else: + numerator = 1.0 + w += numerator / neighbourhood_size(self.G, c, weight) + if w > 0: + res[(a, b)] = w + return res diff --git a/linkpred/predictors/path.py b/linkpred/predictors/path.py new file mode 100644 index 0000000..942751d --- /dev/null +++ b/linkpred/predictors/path.py @@ -0,0 +1,125 @@ +import networkx as nx + +from ..evaluation import Scoresheet +from .base import Predictor + +__all__ = ["GraphDistance", + "WeightedGraphDistance", + "Katz"] + + +class GraphDistance(Predictor): + def predict(self): + res = Scoresheet() + shortest_paths = nx.shortest_path_length(self.G) + for a, reachables in shortest_paths.iteritems(): + if not self.eligible_node(a): + continue + for b, length in reachables.iteritems(): + if a == b or not self.eligible_node(b): + continue + if length > 0: # same node + w = 1.0 / length + res[(a, b)] = w + return res + + +class WeightedGraphDistance(Predictor): + def predict(self, weight='weight', alpha=1): + r"""Predict by weighted graph distance + + This is based on the dissimilarity measures of Egghe & Rousseau (2003): + + $d(i, j) = \min(\sum 1/w_k)$ + + The parameter alpha was introduced by Opsahl et al. (2010): + + $d_\alpha(i, j) = \min(\sum 1 / w_k^\alpha)$ + + If alpha = 0, this reduces to unweighted graph distance, i.e. only keep + track of number of intermediate nodes and not of edge weights. If alpha = 1, + we only keep track of edge weights and not of the number of intermediate + nodes. (In practice, setting alpha equal to around 0.1 seems to yield the + best results.) + + """ + res = Scoresheet() + inverted = nx.Graph() + inverted.add_weighted_edges_from((u, v, 1.0 / d[weight] ** alpha) + for u, v, d in self.G.edges_iter(data=True)) + dist = nx.shortest_path_length(inverted, weight=weight) + for a, others in dist.iteritems(): + if not self.eligible_node(a): + continue + for b, length in others.iteritems(): + if a == b or not self.eligible_node(b): + continue + if a != b: + w = 1.0 / length + res[(a, b)] = w + return res + + +class Katz(Predictor): + def predict(self, beta=0.001, max_power=5, weight='weight', all_walks=True, + dtype=None): + """Predict by Katz (1953) measure + + Let $A$ be an adjacency matrix for the directed network $self.G$. + We assume that $self.G$ is unweighted, hence $A$ only contains values 1 and 0. + Then, each element $a_{ij}^{(k)}$ of $A^k$ (the $k$-th power of $A$) has a + value equal to the number of walks with length $k$ from $i$ to $j$. + + The probability of a link rapidly decreases as the walks grow longer. + Katz therefore introduces an extra parameter (here beta) to weigh + longer walks less. + + Parameters + ---------- + beta : a float + the value of beta in the formula of the Katz equation + + max_power : an int + the maximum number of powers to take into account + + weight : string or None + The edge attribute that holds the numerical value used for + the edge weight. If None then treat as unweighted. + + all_walks : True|False + can walks contain the same node/link more than once? 
+ + dtype : a data type + data type of edge weights (default numpy.int32) + + """ + from linkpred.util import progressbar + from itertools import izip + + if dtype is None: + import numpy + dtype = numpy.int32 + + nodelist = self.G.nodes() + adj = nx.to_scipy_sparse_matrix( + self.G, dtype=dtype, weight=weight) + res = Scoresheet() + + if not all_walks: + from scipy.sparse import triu + # Make triangular upper matrix + adj = triu(adj) + + for k in progressbar(range(1, max_power + 1), "Computing matrix powers: "): + # The below method is found to be fastest for iterating through a + # sparse matrix, see + # http://stackoverflow.com/questions/4319014/iterating-through-a-scipy-sparse-vector-or-matrix + matrix = (adj ** k).tocoo() + for i, j, d in izip(matrix.row, matrix.col, matrix.data): + if i == j: + continue + u, v = nodelist[i], nodelist[j] + if self.eligible(u, v): + w = d * (beta ** k) + res[(u, v)] += w + return res diff --git a/linkpred/predictors/tests/__init__.py b/linkpred/predictors/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/linkpred/predictors/tests/test_base.py b/linkpred/predictors/tests/test_base.py new file mode 100644 index 0000000..a646c8a --- /dev/null +++ b/linkpred/predictors/tests/test_base.py @@ -0,0 +1,38 @@ +from nose.tools import assert_dict_equal, assert_equal, assert_not_in +import networkx as nx + +from linkpred.evaluation import Pair +from linkpred.predictors.neighbour import CommonNeighbours +from linkpred.predictors.misc import Copy + + +def test_bipartite_common_neighbour(): + B = nx.Graph() + B.add_nodes_from(range(1, 5), eligible=0) + B.add_nodes_from('abc', eligible=1) + B.add_edges_from([(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b'), (2, 'c'), + (3, 'c'), (4, 'a')]) + + expected = {Pair('a', 'b'): 2, Pair('b', 'c'): 1, Pair('a', 'c'): 1} + assert_dict_equal(CommonNeighbours(B, eligible='eligible').predict(), expected) + + +def test_bipartite_common_neighbours_equivalent_projection(): + B = nx.bipartite_random_graph(30, 50, 0.1) + nodes = [v for v in B if B.node[v]['bipartite']] + G = nx.bipartite.weighted_projected_graph(B, nodes) + + expected = CommonNeighbours(B, eligible='bipartite')() + assert_dict_equal(Copy(G).predict(weight='weight'), expected) + + +def test_postprocessing(): + G = nx.karate_club_graph() + prediction_all_links = CommonNeighbours(G)() + prediction_only_new_links = CommonNeighbours(G, only_new=True)() + + for link, score in prediction_all_links.iteritems(): + if G.has_edge(*link): + assert_not_in(link, prediction_only_new_links) + else: + assert_equal(score, prediction_only_new_links[link]) diff --git a/linkpred/predictors/tests/test_eigenvector.py b/linkpred/predictors/tests/test_eigenvector.py new file mode 100644 index 0000000..bf468d9 --- /dev/null +++ b/linkpred/predictors/tests/test_eigenvector.py @@ -0,0 +1,30 @@ +from nose.tools import * +import networkx as nx + +from linkpred.predictors.eigenvector import * + +class TestEigenvector: + + def test_rooted_pagerank(self): + pass + + def test_rooted_pagerank_weighted(self): + pass + + def test_rooted_pagerank_alpha(self): + pass + + def test_rooted_pagerank_beta(self): + pass + + def test_rooted_pagerank_k(self): + pass + + def test_simrank(self): + pass + + def test_simrank_c(self): + pass + + def test_simrank_weighted(self): + pass diff --git a/linkpred/predictors/tests/test_misc.py b/linkpred/predictors/tests/test_misc.py new file mode 100644 index 0000000..a1e33d7 --- /dev/null +++ b/linkpred/predictors/tests/test_misc.py @@ -0,0 +1,38 @@ +from 
nose.tools import * +import networkx as nx + +from linkpred.evaluation import Pair +from linkpred.predictors.misc import * + +class TestCopy: + def setup(self): + self.G = nx.Graph() + self.G.add_weighted_edges_from([(0, 1, 3.0),(1, 2, 7.5)]) + + def test_copy_unweighted(self): + expected = {Pair(0, 1): 1, Pair(1, 2): 1} + assert_dict_equal(Copy(self.G).predict(), expected) + + def test_copy_weighted(self): + expected = {Pair(0, 1): 3.0, Pair(1, 2): 7.5} + assert_dict_equal(Copy(self.G).predict(weight="weight"), expected) + + def test_community(self): + pass + + def test_random(self): + G = nx.Graph() + G.add_nodes_from(range(10), eligible=True) + prediction = Random(G).predict() + assert_equal(len(prediction), 45) + + def test_random_exclude_noneligible(self): + G = nx.Graph() + G.add_nodes_from(range(5), eligible=True) + G.add_nodes_from(range(5, 10), eligible=False) + prediction = Random(G, eligible='eligible').predict() + assert_equal(len(prediction), 10) + for i in range(5): + for j in range(5): + if i != j: + assert Pair(i, j) in prediction diff --git a/linkpred/predictors/tests/test_neighbour.py b/linkpred/predictors/tests/test_neighbour.py new file mode 100644 index 0000000..733e070 --- /dev/null +++ b/linkpred/predictors/tests/test_neighbour.py @@ -0,0 +1,793 @@ +from nose.tools import * +import networkx as nx + +from linkpred.evaluation import Pair +from linkpred.predictors.neighbour import * + + +class TestFlorentineFamily: + + def setup(self): + self.G = nx.florentine_families_graph() + nx.set_node_attributes(self.G, 'eligible', dict.fromkeys(self.G, True)) + + def test_adamic_adar(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.55811062655124721, + Pair('Medici', 'Guadagni'): 1.8204784532536746, + Pair('Peruzzi', 'Bischeri'): 0.72134752044448169, + Pair('Lamberteschi', 'Bischeri'): 0.72134752044448169, + Pair('Salviati', 'Albizzi'): 0.55811062655124721, + Pair('Lamberteschi', 'Albizzi'): 0.72134752044448169, + Pair('Peruzzi', 'Guadagni'): 0.91023922662683732, + Pair('Strozzi', 'Medici'): 0.91023922662683732, + Pair('Pazzi', 'Medici'): 1.4426950408889634, + Pair('Ridolfi', 'Albizzi'): 0.55811062655124721, + Pair('Tornabuoni', 'Lamberteschi'): 0.72134752044448169, + Pair('Tornabuoni', 'Salviati'): 0.55811062655124721, + Pair('Ridolfi', 'Acciaiuoli'): 0.55811062655124721, + Pair('Strozzi', 'Guadagni'): 0.91023922662683732, + Pair('Salviati', 'Acciaiuoli'): 0.55811062655124721, + Pair('Guadagni', 'Ginori'): 0.91023922662683732, + Pair('Strozzi', 'Barbadori'): 0.91023922662683732, + Pair('Peruzzi', 'Barbadori'): 0.91023922662683732, + Pair('Tornabuoni', 'Ridolfi'): 0.55811062655124721, + Pair('Albizzi', 'Acciaiuoli'): 0.55811062655124721, + Pair('Tornabuoni', 'Medici'): 0.91023922662683732, + Pair('Ridolfi', 'Medici'): 0.91023922662683732, + Pair('Peruzzi', 'Castellani'): 0.72134752044448169, + Pair('Tornabuoni', 'Strozzi'): 0.91023922662683732, + Pair('Tornabuoni', 'Bischeri'): 0.72134752044448169, + Pair('Barbadori', 'Albizzi'): 0.55811062655124721, + Pair('Castellani', 'Bischeri'): 1.631586747071319, + Pair('Ridolfi', 'Guadagni'): 0.91023922662683732, + Pair('Ridolfi', 'Bischeri'): 0.72134752044448169, + Pair('Ridolfi', 'Peruzzi'): 0.72134752044448169, + Pair('Medici', 'Castellani'): 1.4426950408889634, + Pair('Bischeri', 'Albizzi'): 0.72134752044448169, + Pair('Medici', 'Ginori'): 0.91023922662683732, + Pair('Salviati', 'Ridolfi'): 0.55811062655124721, + Pair('Tornabuoni', 'Barbadori'): 0.55811062655124721, + Pair('Strozzi', 'Castellani'): 0.91023922662683732, + 
Pair('Salviati', 'Barbadori'): 0.55811062655124721, + Pair('Strozzi', 'Peruzzi'): 1.8204784532536746, + Pair('Strozzi', 'Bischeri'): 0.91023922662683732, + Pair('Tornabuoni', 'Albizzi'): 1.2794581469957289, + Pair('Barbadori', 'Acciaiuoli'): 0.55811062655124721, + Pair('Ridolfi', 'Castellani'): 0.72134752044448169, + Pair('Tornabuoni', 'Acciaiuoli'): 0.55811062655124721} + assert_dict_equal(AdamicAdar(self.G).predict(), answer) + + def test_adamic_adar_weighted(self): + pass + + def test_association_strength(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.16666666666666666, + Pair('Medici', 'Guadagni'): 0.083333333333333329, + Pair('Peruzzi', 'Bischeri'): 0.1111111111111111, + Pair('Lamberteschi', 'Bischeri'): 0.33333333333333331, + Pair('Salviati', 'Albizzi'): 0.16666666666666666, + Pair('Lamberteschi', 'Albizzi'): 0.33333333333333331, + Pair('Peruzzi', 'Guadagni'): 0.083333333333333329, + Pair('Strozzi', 'Medici'): 0.041666666666666664, + Pair('Pazzi', 'Medici'): 0.16666666666666666, + Pair('Ridolfi', 'Albizzi'): 0.1111111111111111, + Pair('Tornabuoni', 'Lamberteschi'): 0.33333333333333331, + Pair('Tornabuoni', 'Salviati'): 0.16666666666666666, + Pair('Ridolfi', 'Acciaiuoli'): 0.33333333333333331, + Pair('Strozzi', 'Guadagni'): 0.0625, + Pair('Salviati', 'Acciaiuoli'): 0.5, + Pair('Guadagni', 'Ginori'): 0.25, + Pair('Strozzi', 'Barbadori'): 0.125, + Pair('Peruzzi', 'Barbadori'): 0.16666666666666666, + Pair('Tornabuoni', 'Ridolfi'): 0.1111111111111111, + Pair('Albizzi', 'Acciaiuoli'): 0.33333333333333331, + Pair('Tornabuoni', 'Medici'): 0.055555555555555552, + Pair('Ridolfi', 'Medici'): 0.055555555555555552, + Pair('Peruzzi', 'Castellani'): 0.1111111111111111, + Pair('Tornabuoni', 'Strozzi'): 0.083333333333333329, + Pair('Tornabuoni', 'Bischeri'): 0.1111111111111111, + Pair('Barbadori', 'Albizzi'): 0.16666666666666666, + Pair('Castellani', 'Bischeri'): 0.22222222222222221, + Pair('Ridolfi', 'Guadagni'): 0.083333333333333329, + Pair('Ridolfi', 'Bischeri'): 0.1111111111111111, + Pair('Ridolfi', 'Peruzzi'): 0.1111111111111111, + Pair('Medici', 'Castellani'): 0.055555555555555552, + Pair('Bischeri', 'Albizzi'): 0.1111111111111111, + Pair('Medici', 'Ginori'): 0.16666666666666666, + Pair('Salviati', 'Ridolfi'): 0.16666666666666666, + Pair('Tornabuoni', 'Barbadori'): 0.16666666666666666, + Pair('Strozzi', 'Castellani'): 0.083333333333333329, + Pair('Salviati', 'Barbadori'): 0.25, + Pair('Strozzi', 'Peruzzi'): 0.16666666666666666, + Pair('Strozzi', 'Bischeri'): 0.083333333333333329, + Pair('Tornabuoni', 'Albizzi'): 0.22222222222222221, + Pair('Barbadori', 'Acciaiuoli'): 0.5, + Pair('Ridolfi', 'Castellani'): 0.1111111111111111, + Pair('Tornabuoni', 'Acciaiuoli'): 0.33333333333333331} + assert_dict_equal(AssociationStrength(self.G).predict(), answer) + + def test_association_strength_weighted(self): + pass + + def test_chi_square(self): + pass + + def test_common_neighbours(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 1.0, + Pair('Medici', 'Guadagni'): 2.0, + Pair('Peruzzi', 'Bischeri'): 1.0, + Pair('Lamberteschi', 'Bischeri'): 1.0, + Pair('Salviati', 'Albizzi'): 1.0, + Pair('Lamberteschi', 'Albizzi'): 1.0, + Pair('Peruzzi', 'Guadagni'): 1.0, + Pair('Strozzi', 'Medici'): 1.0, + Pair('Pazzi', 'Medici'): 1.0, + Pair('Ridolfi', 'Albizzi'): 1.0, + Pair('Tornabuoni', 'Lamberteschi'): 1.0, + Pair('Tornabuoni', 'Salviati'): 1.0, + Pair('Ridolfi', 'Acciaiuoli'): 1.0, + Pair('Strozzi', 'Guadagni'): 1.0, + Pair('Salviati', 'Acciaiuoli'): 1.0, + Pair('Guadagni', 'Ginori'): 1.0, + Pair('Strozzi', 
'Barbadori'): 1.0, + Pair('Peruzzi', 'Barbadori'): 1.0, + Pair('Tornabuoni', 'Ridolfi'): 1.0, + Pair('Albizzi', 'Acciaiuoli'): 1.0, + Pair('Tornabuoni', 'Medici'): 1.0, + Pair('Ridolfi', 'Medici'): 1.0, + Pair('Peruzzi', 'Castellani'): 1.0, + Pair('Tornabuoni', 'Strozzi'): 1.0, + Pair('Tornabuoni', 'Bischeri'): 1.0, + Pair('Barbadori', 'Albizzi'): 1.0, + Pair('Castellani', 'Bischeri'): 2.0, + Pair('Ridolfi', 'Guadagni'): 1.0, + Pair('Ridolfi', 'Bischeri'): 1.0, + Pair('Ridolfi', 'Peruzzi'): 1.0, + Pair('Medici', 'Castellani'): 1.0, + Pair('Bischeri', 'Albizzi'): 1.0, + Pair('Medici', 'Ginori'): 1.0, + Pair('Salviati', 'Ridolfi'): 1.0, + Pair('Tornabuoni', 'Barbadori'): 1.0, + Pair('Strozzi', 'Castellani'): 1.0, + Pair('Salviati', 'Barbadori'): 1.0, + Pair('Strozzi', 'Peruzzi'): 2.0, + Pair('Strozzi', 'Bischeri'): 1.0, + Pair('Tornabuoni', 'Albizzi'): 2.0, + Pair('Barbadori', 'Acciaiuoli'): 1.0, + Pair('Ridolfi', 'Castellani'): 1.0, + Pair('Tornabuoni', 'Acciaiuoli'): 1.0} + assert_dict_equal(CommonNeighbours(self.G).predict(), answer) + + def test_common_neighbours_alpha(self): + pass + + def test_common_k_neighbours(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.010812, + Pair('Medici', 'Guadagni'): 0.020512000000000002, + Pair('Peruzzi', 'Bischeri'): 0.010409, + Pair('Lamberteschi', 'Bischeri'): 0.010307999999999999, + Pair('Salviati', 'Albizzi'): 0.01051, + Pair('Lamberteschi', 'Guadagni'): 0.00030800000000000001, + Pair('Lamberteschi', 'Albizzi'): 0.010307999999999999, + Pair('Peruzzi', 'Guadagni'): 0.010309, + Pair('Salviati', 'Pazzi'): 0.000106, + Pair('Strozzi', 'Medici'): 0.010511000000000001, + Pair('Pazzi', 'Medici'): 0.010106, + Pair('Strozzi', 'Barbadori'): 0.01051, + Pair('Strozzi', 'Ridolfi'): 0.00071100000000000004, + Pair('Tornabuoni', 'Lamberteschi'): 0.010307999999999999, + Pair('Tornabuoni', 'Salviati'): 0.01051, + Pair('Ridolfi', 'Acciaiuoli'): 0.01051, + Pair('Strozzi', 'Guadagni'): 0.010511000000000001, + Pair('Medici', 'Acciaiuoli'): 0.00051000000000000004, + Pair('Guadagni', 'Ginori'): 0.010209000000000001, + Pair('Castellani', 'Barbadori'): 0.00041000000000000005, + Pair('Guadagni', 'Bischeri'): 0.00061000000000000008, + Pair('Ridolfi', 'Albizzi'): 0.010713, + Pair('Barbadori', 'Albizzi'): 0.010512000000000001, + Pair('Medici', 'Barbadori'): 0.00071199999999999996, + Pair('Peruzzi', 'Barbadori'): 0.010307999999999999, + Pair('Strozzi', 'Bischeri'): 0.010509000000000001, + Pair('Albizzi', 'Acciaiuoli'): 0.01051, + Pair('Tornabuoni', 'Medici'): 0.010713, + Pair('Guadagni', 'Albizzi'): 0.00061200000000000002, + Pair('Ridolfi', 'Medici'): 0.010813000000000001, + Pair('Peruzzi', 'Castellani'): 0.010407999999999999, + Pair('Tornabuoni', 'Strozzi'): 0.010511000000000001, + Pair('Tornabuoni', 'Bischeri'): 0.01051, + Pair('Medici', 'Albizzi'): 0.00071299999999999998, + Pair('Castellani', 'Bischeri'): 0.020308000000000003, + Pair('Salviati', 'Barbadori'): 0.01051, + Pair('Ridolfi', 'Guadagni'): 0.010612, + Pair('Ridolfi', 'Bischeri'): 0.01061, + Pair('Ridolfi', 'Peruzzi'): 0.010509000000000001, + Pair('Tornabuoni', 'Guadagni'): 0.00061200000000000002, + Pair('Bischeri', 'Albizzi'): 0.010409999999999999, + Pair('Ridolfi', 'Castellani'): 0.01051, + Pair('Medici', 'Ginori'): 0.010209000000000001, + Pair('Salviati', 'Ridolfi'): 0.01051, + Pair('Tornabuoni', 'Barbadori'): 0.010612, + Pair('Salviati', 'Acciaiuoli'): 0.01051, + Pair('Strozzi', 'Castellani'): 0.01051, + Pair('Salviati', 'Medici'): 0.00061000000000000008, + Pair('Strozzi', 'Peruzzi'): 0.020508999999999999, + 
Pair('Tornabuoni', 'Ridolfi'): 0.010813000000000001, + Pair('Tornabuoni', 'Albizzi'): 0.020812999999999998, + Pair('Barbadori', 'Acciaiuoli'): 0.01051, + Pair('Medici', 'Castellani'): 0.01031, + Pair('Tornabuoni', 'Acciaiuoli'): 0.01051, + Pair('Ginori', 'Albizzi'): 0.00020900000000000001} + assert_dict_equal(CommonKNeighbours(self.G).predict(), answer) + + def test_cosine(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.40824829046386307, + Pair('Medici', 'Guadagni'): 0.40824829046386307, + Pair('Peruzzi', 'Bischeri'): 0.33333333333333331, + Pair('Lamberteschi', 'Bischeri'): 0.57735026918962584, + Pair('Salviati', 'Albizzi'): 0.40824829046386307, + Pair('Lamberteschi', 'Albizzi'): 0.57735026918962584, + Pair('Peruzzi', 'Guadagni'): 0.28867513459481292, + Pair('Strozzi', 'Medici'): 0.20412414523193154, + Pair('Pazzi', 'Medici'): 0.40824829046386307, + Pair('Ridolfi', 'Albizzi'): 0.33333333333333331, + Pair('Tornabuoni', 'Lamberteschi'): 0.57735026918962584, + Pair('Tornabuoni', 'Salviati'): 0.40824829046386307, + Pair('Ridolfi', 'Acciaiuoli'): 0.57735026918962584, + Pair('Strozzi', 'Guadagni'): 0.25, + Pair('Salviati', 'Acciaiuoli'): 0.70710678118654746, + Pair('Guadagni', 'Ginori'): 0.5, + Pair('Strozzi', 'Barbadori'): 0.35355339059327373, + Pair('Peruzzi', 'Barbadori'): 0.40824829046386307, + Pair('Tornabuoni', 'Ridolfi'): 0.33333333333333331, + Pair('Albizzi', 'Acciaiuoli'): 0.57735026918962584, + Pair('Tornabuoni', 'Medici'): 0.23570226039551587, + Pair('Ridolfi', 'Medici'): 0.23570226039551587, + Pair('Peruzzi', 'Castellani'): 0.33333333333333331, + Pair('Tornabuoni', 'Strozzi'): 0.28867513459481292, + Pair('Tornabuoni', 'Bischeri'): 0.33333333333333331, + Pair('Barbadori', 'Albizzi'): 0.40824829046386307, + Pair('Castellani', 'Bischeri'): 0.66666666666666663, + Pair('Ridolfi', 'Guadagni'): 0.28867513459481292, + Pair('Ridolfi', 'Bischeri'): 0.33333333333333331, + Pair('Ridolfi', 'Peruzzi'): 0.33333333333333331, + Pair('Medici', 'Castellani'): 0.23570226039551587, + Pair('Bischeri', 'Albizzi'): 0.33333333333333331, + Pair('Medici', 'Ginori'): 0.40824829046386307, + Pair('Salviati', 'Ridolfi'): 0.40824829046386307, + Pair('Tornabuoni', 'Barbadori'): 0.40824829046386307, + Pair('Strozzi', 'Castellani'): 0.28867513459481292, + Pair('Salviati', 'Barbadori'): 0.5, + Pair('Strozzi', 'Peruzzi'): 0.57735026918962584, + Pair('Strozzi', 'Bischeri'): 0.28867513459481292, + Pair('Tornabuoni', 'Albizzi'): 0.66666666666666663, + Pair('Barbadori', 'Acciaiuoli'): 0.70710678118654746, + Pair('Ridolfi', 'Castellani'): 0.33333333333333331, + Pair('Tornabuoni', 'Acciaiuoli'): 0.57735026918962584} + assert_dict_equal(Cosine(self.G).predict(), answer) + + def test_cosine_weighted(self): + pass + + def test_degree_product(self): + answer = { + Pair('Peruzzi', 'Bischeri'): 9.0, + Pair('Lamberteschi', 'Albizzi'): 3.0, + Pair('Tornabuoni', 'Ginori'): 3.0, + Pair('Salviati', 'Pazzi'): 2.0, + Pair('Guadagni', 'Castellani'): 12.0, + Pair('Tornabuoni', 'Castellani'): 9.0, + Pair('Castellani', 'Albizzi'): 9.0, + Pair('Ginori', 'Barbadori'): 2.0, + Pair('Pazzi', 'Guadagni'): 4.0, + Pair('Castellani', 'Barbadori'): 6.0, + Pair('Lamberteschi', 'Acciaiuoli'): 1.0, + Pair('Ginori', 'Acciaiuoli'): 1.0, + Pair('Lamberteschi', 'Ginori'): 1.0, + Pair('Peruzzi', 'Barbadori'): 6.0, + Pair('Medici', 'Castellani'): 18.0, + Pair('Ginori', 'Castellani'): 3.0, + Pair('Guadagni', 'Barbadori'): 8.0, + Pair('Salviati', 'Medici'): 12.0, + Pair('Ridolfi', 'Lamberteschi'): 3.0, + Pair('Salviati', 'Ginori'): 2.0, + Pair('Salviati', 
'Barbadori'): 4.0, + Pair('Strozzi', 'Pazzi'): 4.0, + Pair('Pazzi', 'Acciaiuoli'): 1.0, + Pair('Tornabuoni', 'Medici'): 18.0, + Pair('Strozzi', 'Albizzi'): 12.0, + Pair('Guadagni', 'Acciaiuoli'): 4.0, + Pair('Lamberteschi', 'Bischeri'): 3.0, + Pair('Ridolfi', 'Ginori'): 3.0, + Pair('Castellani', 'Bischeri'): 9.0, + Pair('Strozzi', 'Medici'): 24.0, + Pair('Bischeri', 'Acciaiuoli'): 3.0, + Pair('Strozzi', 'Guadagni'): 16.0, + Pair('Medici', 'Acciaiuoli'): 6.0, + Pair('Medici', 'Albizzi'): 18.0, + Pair('Pazzi', 'Albizzi'): 3.0, + Pair('Peruzzi', 'Medici'): 18.0, + Pair('Guadagni', 'Albizzi'): 12.0, + Pair('Strozzi', 'Acciaiuoli'): 4.0, + Pair('Bischeri', 'Barbadori'): 6.0, + Pair('Peruzzi', 'Castellani'): 9.0, + Pair('Strozzi', 'Ridolfi'): 12.0, + Pair('Barbadori', 'Albizzi'): 6.0, + Pair('Ridolfi', 'Peruzzi'): 9.0, + Pair('Bischeri', 'Albizzi'): 9.0, + Pair('Ridolfi', 'Barbadori'): 6.0, + Pair('Peruzzi', 'Pazzi'): 3.0, + Pair('Strozzi', 'Peruzzi'): 12.0, + Pair('Pazzi', 'Ginori'): 1.0, + Pair('Medici', 'Lamberteschi'): 6.0, + Pair('Strozzi', 'Bischeri'): 12.0, + Pair('Salviati', 'Lamberteschi'): 2.0, + Pair('Ridolfi', 'Castellani'): 9.0, + Pair('Peruzzi', 'Lamberteschi'): 3.0, + Pair('Ginori', 'Albizzi'): 3.0, + Pair('Peruzzi', 'Guadagni'): 12.0, + Pair('Strozzi', 'Lamberteschi'): 4.0, + Pair('Medici', 'Guadagni'): 24.0, + Pair('Salviati', 'Bischeri'): 6.0, + Pair('Tornabuoni', 'Salviati'): 6.0, + Pair('Medici', 'Barbadori'): 12.0, + Pair('Guadagni', 'Bischeri'): 12.0, + Pair('Salviati', 'Ridolfi'): 6.0, + Pair('Salviati', 'Peruzzi'): 6.0, + Pair('Pazzi', 'Barbadori'): 2.0, + Pair('Ridolfi', 'Medici'): 18.0, + Pair('Ridolfi', 'Guadagni'): 12.0, + Pair('Ridolfi', 'Bischeri'): 9.0, + Pair('Tornabuoni', 'Guadagni'): 12.0, + Pair('Castellani', 'Acciaiuoli'): 3.0, + Pair('Tornabuoni', 'Barbadori'): 6.0, + Pair('Ginori', 'Bischeri'): 3.0, + Pair('Lamberteschi', 'Castellani'): 3.0, + Pair('Tornabuoni', 'Albizzi'): 9.0, + Pair('Salviati', 'Guadagni'): 8.0, + Pair('Tornabuoni', 'Pazzi'): 3.0, + Pair('Salviati', 'Albizzi'): 6.0, + Pair('Lamberteschi', 'Guadagni'): 4.0, + Pair('Ridolfi', 'Pazzi'): 3.0, + Pair('Peruzzi', 'Albizzi'): 9.0, + Pair('Strozzi', 'Salviati'): 8.0, + Pair('Strozzi', 'Barbadori'): 8.0, + Pair('Tornabuoni', 'Lamberteschi'): 3.0, + Pair('Pazzi', 'Medici'): 6.0, + Pair('Ridolfi', 'Acciaiuoli'): 3.0, + Pair('Guadagni', 'Ginori'): 4.0, + Pair('Ridolfi', 'Albizzi'): 9.0, + Pair('Albizzi', 'Acciaiuoli'): 3.0, + Pair('Tornabuoni', 'Strozzi'): 12.0, + Pair('Tornabuoni', 'Bischeri'): 9.0, + Pair('Tornabuoni', 'Peruzzi'): 9.0, + Pair('Salviati', 'Castellani'): 6.0, + Pair('Peruzzi', 'Ginori'): 3.0, + Pair('Medici', 'Ginori'): 6.0, + Pair('Peruzzi', 'Acciaiuoli'): 3.0, + Pair('Pazzi', 'Lamberteschi'): 1.0, + Pair('Pazzi', 'Castellani'): 3.0, + Pair('Strozzi', 'Castellani'): 12.0, + Pair('Lamberteschi', 'Barbadori'): 2.0, + Pair('Salviati', 'Acciaiuoli'): 2.0, + Pair('Pazzi', 'Bischeri'): 3.0, + Pair('Strozzi', 'Ginori'): 4.0, + Pair('Tornabuoni', 'Ridolfi'): 9.0, + Pair('Barbadori', 'Acciaiuoli'): 2.0, + Pair('Tornabuoni', 'Acciaiuoli'): 3.0, + Pair('Medici', 'Bischeri'): 18.0} + assert_dict_equal(DegreeProduct(self.G).predict(), answer) + + def test_degree_product_weighted(self): + pass + + def test_minkowski(self): + pass + + def test_euclidean(self): + pass + + def test_hirsch_core(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.25, + Pair('Medici', 'Guadagni'): 0.5, + Pair('Peruzzi', 'Bischeri'): 0.20000000000000001, + Pair('Lamberteschi', 'Bischeri'): 0.33333333333333331, + 
Pair('Salviati', 'Albizzi'): 0.33333333333333331, + Pair('Lamberteschi', 'Albizzi'): 0.5, + Pair('Peruzzi', 'Guadagni'): 0.20000000000000001, + Pair('Strozzi', 'Medici'): 0.16666666666666666, + Pair('Ridolfi', 'Albizzi'): 0.25, + Pair('Tornabuoni', 'Lamberteschi'): 0.33333333333333331, + Pair('Tornabuoni', 'Salviati'): 0.25, + Pair('Ridolfi', 'Acciaiuoli'): 0.33333333333333331, + Pair('Strozzi', 'Guadagni'): 0.16666666666666666, + Pair('Salviati', 'Acciaiuoli'): 0.5, + Pair('Guadagni', 'Ginori'): 0.33333333333333331, + Pair('Strozzi', 'Barbadori'): 0.20000000000000001, + Pair('Peruzzi', 'Barbadori'): 0.25, + Pair('Tornabuoni', 'Ridolfi'): 0.20000000000000001, + Pair('Albizzi', 'Acciaiuoli'): 0.5, + Pair('Tornabuoni', 'Medici'): 0.20000000000000001, + Pair('Ridolfi', 'Medici'): 0.20000000000000001, + Pair('Peruzzi', 'Castellani'): 0.20000000000000001, + Pair('Tornabuoni', 'Strozzi'): 0.16666666666666666, + Pair('Tornabuoni', 'Bischeri'): 0.20000000000000001, + Pair('Barbadori', 'Albizzi'): 0.33333333333333331, + Pair('Castellani', 'Bischeri'): 0.5, + Pair('Ridolfi', 'Guadagni'): 0.20000000000000001, + Pair('Ridolfi', 'Bischeri'): 0.20000000000000001, + Pair('Ridolfi', 'Peruzzi'): 0.20000000000000001, + Pair('Bischeri', 'Albizzi'): 0.25, + Pair('Medici', 'Ginori'): 0.33333333333333331, + Pair('Salviati', 'Ridolfi'): 0.25, + Pair('Tornabuoni', 'Barbadori'): 0.25, + Pair('Strozzi', 'Castellani'): 0.16666666666666666, + Pair('Salviati', 'Barbadori'): 0.33333333333333331, + Pair('Strozzi', 'Peruzzi'): 0.40000000000000002, + Pair('Strozzi', 'Bischeri'): 0.16666666666666666, + Pair('Tornabuoni', 'Albizzi'): 0.66666666666666663, + Pair('Barbadori', 'Acciaiuoli'): 0.5, + Pair('Ridolfi', 'Castellani'): 0.20000000000000001, + Pair('Tornabuoni', 'Acciaiuoli'): 0.33333333333333331} + assert_dict_equal(HirschCore(self.G).predict(), answer) + + def test_jaccard(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.25, + Pair('Medici', 'Guadagni'): 0.25, + Pair('Peruzzi', 'Bischeri'): 0.20000000000000001, + Pair('Lamberteschi', 'Bischeri'): 0.33333333333333331, + Pair('Salviati', 'Albizzi'): 0.25, + Pair('Lamberteschi', 'Albizzi'): 0.33333333333333331, + Pair('Peruzzi', 'Guadagni'): 0.16666666666666666, + Pair('Strozzi', 'Medici'): 0.1111111111111111, + Pair('Pazzi', 'Medici'): 0.16666666666666666, + Pair('Ridolfi', 'Albizzi'): 0.20000000000000001, + Pair('Tornabuoni', 'Lamberteschi'): 0.33333333333333331, + Pair('Tornabuoni', 'Salviati'): 0.25, + Pair('Ridolfi', 'Acciaiuoli'): 0.33333333333333331, + Pair('Strozzi', 'Guadagni'): 0.14285714285714285, + Pair('Salviati', 'Acciaiuoli'): 0.5, + Pair('Guadagni', 'Ginori'): 0.25, + Pair('Strozzi', 'Barbadori'): 0.20000000000000001, + Pair('Peruzzi', 'Barbadori'): 0.25, + Pair('Tornabuoni', 'Ridolfi'): 0.20000000000000001, + Pair('Albizzi', 'Acciaiuoli'): 0.33333333333333331, + Pair('Tornabuoni', 'Medici'): 0.125, + Pair('Ridolfi', 'Medici'): 0.125, + Pair('Peruzzi', 'Castellani'): 0.20000000000000001, + Pair('Tornabuoni', 'Strozzi'): 0.16666666666666666, + Pair('Tornabuoni', 'Bischeri'): 0.20000000000000001, + Pair('Barbadori', 'Albizzi'): 0.25, + Pair('Castellani', 'Bischeri'): 0.5, + Pair('Ridolfi', 'Guadagni'): 0.16666666666666666, + Pair('Ridolfi', 'Bischeri'): 0.20000000000000001, + Pair('Ridolfi', 'Peruzzi'): 0.20000000000000001, + Pair('Medici', 'Castellani'): 0.125, + Pair('Bischeri', 'Albizzi'): 0.20000000000000001, + Pair('Medici', 'Ginori'): 0.16666666666666666, + Pair('Salviati', 'Ridolfi'): 0.25, + Pair('Tornabuoni', 'Barbadori'): 0.25, + 
Pair('Strozzi', 'Castellani'): 0.16666666666666666, + Pair('Salviati', 'Barbadori'): 0.33333333333333331, + Pair('Strozzi', 'Peruzzi'): 0.40000000000000002, + Pair('Strozzi', 'Bischeri'): 0.16666666666666666, + Pair('Tornabuoni', 'Albizzi'): 0.5, + Pair('Barbadori', 'Acciaiuoli'): 0.5, + Pair('Ridolfi', 'Castellani'): 0.20000000000000001, + Pair('Tornabuoni', 'Acciaiuoli'): 0.33333333333333331} + assert_dict_equal(Jaccard(self.G).predict(), answer) + + def test_jaccard_weighted(self): + pass + + def test_k50(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.34378803407483205, + Pair('Medici', 'Guadagni'): 0.27216552697590873, + Pair('Peruzzi', 'Bischeri'): 0.25225225225225228, + Pair('Lamberteschi', 'Bischeri'): 0.53293871002119308, + Pair('Salviati', 'Albizzi'): 0.34378803407483205, + Pair('Lamberteschi', 'Albizzi'): 0.53293871002119308, + Pair('Peruzzi', 'Guadagni'): 0.19505076661811682, + Pair('Strozzi', 'Medici'): 0.068041381743977183, + Pair('Pazzi', 'Medici'): 0.34544086116173028, + Pair('Ridolfi', 'Albizzi'): 0.25225225225225228, + Pair('Tornabuoni', 'Lamberteschi'): 0.53293871002119308, + Pair('Tornabuoni', 'Salviati'): 0.34378803407483205, + Pair('Ridolfi', 'Acciaiuoli'): 0.53293871002119308, + Pair('Strozzi', 'Guadagni'): 0.1388888888888889, + Pair('Salviati', 'Acciaiuoli'): 0.67084489497185273, + Pair('Guadagni', 'Ginori'): 0.44871794871794873, + Pair('Strozzi', 'Barbadori'): 0.27912109783679506, + Pair('Peruzzi', 'Barbadori'): 0.34378803407483205, + Pair('Tornabuoni', 'Ridolfi'): 0.25225225225225228, + Pair('Albizzi', 'Acciaiuoli'): 0.53293871002119308, + Pair('Tornabuoni', 'Medici'): 0.12103629587877841, + Pair('Ridolfi', 'Medici'): 0.12103629587877841, + Pair('Peruzzi', 'Castellani'): 0.25225225225225228, + Pair('Tornabuoni', 'Strozzi'): 0.19505076661811682, + Pair('Tornabuoni', 'Bischeri'): 0.25225225225225228, + Pair('Barbadori', 'Albizzi'): 0.34378803407483205, + Pair('Castellani', 'Bischeri'): 0.5855855855855856, + Pair('Ridolfi', 'Guadagni'): 0.19505076661811682, + Pair('Ridolfi', 'Bischeri'): 0.25225225225225228, + Pair('Ridolfi', 'Peruzzi'): 0.25225225225225228, + Pair('Medici', 'Castellani'): 0.12103629587877841, + Pair('Bischeri', 'Albizzi'): 0.25225225225225228, + Pair('Medici', 'Ginori'): 0.34544086116173028, + Pair('Salviati', 'Ridolfi'): 0.34378803407483205, + Pair('Tornabuoni', 'Barbadori'): 0.34378803407483205, + Pair('Strozzi', 'Castellani'): 0.19505076661811682, + Pair('Salviati', 'Barbadori'): 0.44736842105263158, + Pair('Strozzi', 'Peruzzi'): 0.48372590121292974, + Pair('Strozzi', 'Bischeri'): 0.19505076661811682, + Pair('Tornabuoni', 'Albizzi'): 0.5855855855855856, + Pair('Barbadori', 'Acciaiuoli'): 0.67084489497185273, + Pair('Ridolfi', 'Castellani'): 0.25225225225225228, + Pair('Tornabuoni', 'Acciaiuoli'): 0.53293871002119308} + assert_dict_equal(K50(self.G).predict(), answer) + + def test_manhattan(self): + pass + + def test_n_measure(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.39223227027636809, + Pair('Medici', 'Guadagni'): 0.39223227027636809, + Pair('Peruzzi', 'Bischeri'): 0.33333333333333337, + Pair('Lamberteschi', 'Bischeri'): 0.44721359549995793, + Pair('Salviati', 'Albizzi'): 0.39223227027636809, + Pair('Lamberteschi', 'Albizzi'): 0.44721359549995793, + Pair('Peruzzi', 'Guadagni'): 0.28284271247461901, + Pair('Strozzi', 'Medici'): 0.19611613513818404, + Pair('Pazzi', 'Medici'): 0.2324952774876386, + Pair('Ridolfi', 'Albizzi'): 0.33333333333333337, + Pair('Tornabuoni', 'Lamberteschi'): 0.44721359549995793, + Pair('Tornabuoni', 
'Salviati'): 0.39223227027636809, + Pair('Ridolfi', 'Acciaiuoli'): 0.44721359549995793, + Pair('Strozzi', 'Guadagni'): 0.25, + Pair('Salviati', 'Acciaiuoli'): 0.63245553203367588, + Pair('Guadagni', 'Ginori'): 0.34299717028501769, + Pair('Strozzi', 'Barbadori'): 0.31622776601683794, + Pair('Peruzzi', 'Barbadori'): 0.39223227027636809, + Pair('Tornabuoni', 'Ridolfi'): 0.33333333333333337, + Pair('Albizzi', 'Acciaiuoli'): 0.44721359549995793, + Pair('Tornabuoni', 'Medici'): 0.21081851067789195, + Pair('Ridolfi', 'Medici'): 0.21081851067789195, + Pair('Peruzzi', 'Castellani'): 0.33333333333333337, + Pair('Tornabuoni', 'Strozzi'): 0.28284271247461901, + Pair('Tornabuoni', 'Bischeri'): 0.33333333333333337, + Pair('Barbadori', 'Albizzi'): 0.39223227027636809, + Pair('Castellani', 'Bischeri'): 0.66666666666666674, + Pair('Ridolfi', 'Guadagni'): 0.28284271247461901, + Pair('Ridolfi', 'Bischeri'): 0.33333333333333337, + Pair('Ridolfi', 'Peruzzi'): 0.33333333333333337, + Pair('Medici', 'Castellani'): 0.21081851067789195, + Pair('Bischeri', 'Albizzi'): 0.33333333333333337, + Pair('Medici', 'Ginori'): 0.2324952774876386, + Pair('Salviati', 'Ridolfi'): 0.39223227027636809, + Pair('Tornabuoni', 'Barbadori'): 0.39223227027636809, + Pair('Strozzi', 'Castellani'): 0.28284271247461901, + Pair('Salviati', 'Barbadori'): 0.5, + Pair('Strozzi', 'Peruzzi'): 0.56568542494923801, + Pair('Strozzi', 'Bischeri'): 0.28284271247461901, + Pair('Tornabuoni', 'Albizzi'): 0.66666666666666674, + Pair('Barbadori', 'Acciaiuoli'): 0.63245553203367588, + Pair('Ridolfi', 'Castellani'): 0.33333333333333337, + Pair('Tornabuoni', 'Acciaiuoli'): 0.44721359549995793} + assert_dict_equal(NMeasure(self.G).predict(), answer) + + def test_n_measure_weighted(self): + pass + + def test_max_overlap(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.33333333333333331, + Pair('Medici', 'Guadagni'): 0.33333333333333331, + Pair('Peruzzi', 'Bischeri'): 0.33333333333333331, + Pair('Lamberteschi', 'Bischeri'): 0.33333333333333331, + Pair('Salviati', 'Albizzi'): 0.33333333333333331, + Pair('Lamberteschi', 'Albizzi'): 0.33333333333333331, + Pair('Peruzzi', 'Guadagni'): 0.25, + Pair('Strozzi', 'Medici'): 0.16666666666666666, + Pair('Pazzi', 'Medici'): 0.16666666666666666, + Pair('Ridolfi', 'Albizzi'): 0.33333333333333331, + Pair('Tornabuoni', 'Lamberteschi'): 0.33333333333333331, + Pair('Tornabuoni', 'Salviati'): 0.33333333333333331, + Pair('Ridolfi', 'Acciaiuoli'): 0.33333333333333331, + Pair('Strozzi', 'Guadagni'): 0.25, + Pair('Salviati', 'Acciaiuoli'): 0.5, + Pair('Guadagni', 'Ginori'): 0.25, + Pair('Strozzi', 'Barbadori'): 0.25, + Pair('Peruzzi', 'Barbadori'): 0.33333333333333331, + Pair('Tornabuoni', 'Ridolfi'): 0.33333333333333331, + Pair('Albizzi', 'Acciaiuoli'): 0.33333333333333331, + Pair('Tornabuoni', 'Medici'): 0.16666666666666666, + Pair('Ridolfi', 'Medici'): 0.16666666666666666, + Pair('Peruzzi', 'Castellani'): 0.33333333333333331, + Pair('Tornabuoni', 'Strozzi'): 0.25, + Pair('Tornabuoni', 'Bischeri'): 0.33333333333333331, + Pair('Barbadori', 'Albizzi'): 0.33333333333333331, + Pair('Castellani', 'Bischeri'): 0.66666666666666663, + Pair('Ridolfi', 'Guadagni'): 0.25, + Pair('Ridolfi', 'Bischeri'): 0.33333333333333331, + Pair('Ridolfi', 'Peruzzi'): 0.33333333333333331, + Pair('Medici', 'Castellani'): 0.16666666666666666, + Pair('Bischeri', 'Albizzi'): 0.33333333333333331, + Pair('Medici', 'Ginori'): 0.16666666666666666, + Pair('Salviati', 'Ridolfi'): 0.33333333333333331, + Pair('Tornabuoni', 'Barbadori'): 0.33333333333333331, + 
Pair('Strozzi', 'Castellani'): 0.25, + Pair('Salviati', 'Barbadori'): 0.5, + Pair('Strozzi', 'Peruzzi'): 0.5, + Pair('Strozzi', 'Bischeri'): 0.25, + Pair('Tornabuoni', 'Albizzi'): 0.66666666666666663, + Pair('Barbadori', 'Acciaiuoli'): 0.5, + Pair('Ridolfi', 'Castellani'): 0.33333333333333331, + Pair('Tornabuoni', 'Acciaiuoli'): 0.33333333333333331} + assert_dict_equal(MaxOverlap(self.G).predict(), answer) + + def test_max_overlap_weighted(self): + pass + + def test_min_overlap(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.5, + Pair('Medici', 'Guadagni'): 0.5, + Pair('Peruzzi', 'Bischeri'): 0.33333333333333331, + Pair('Lamberteschi', 'Bischeri'): 1.0, + Pair('Salviati', 'Albizzi'): 0.5, + Pair('Lamberteschi', 'Albizzi'): 1.0, + Pair('Peruzzi', 'Guadagni'): 0.33333333333333331, + Pair('Strozzi', 'Medici'): 0.25, + Pair('Pazzi', 'Medici'): 1.0, + Pair('Ridolfi', 'Albizzi'): 0.33333333333333331, + Pair('Tornabuoni', 'Lamberteschi'): 1.0, + Pair('Tornabuoni', 'Salviati'): 0.5, + Pair('Ridolfi', 'Acciaiuoli'): 1.0, + Pair('Strozzi', 'Guadagni'): 0.25, + Pair('Salviati', 'Acciaiuoli'): 1.0, + Pair('Guadagni', 'Ginori'): 1.0, + Pair('Strozzi', 'Barbadori'): 0.5, + Pair('Peruzzi', 'Barbadori'): 0.5, + Pair('Tornabuoni', 'Ridolfi'): 0.33333333333333331, + Pair('Albizzi', 'Acciaiuoli'): 1.0, + Pair('Tornabuoni', 'Medici'): 0.33333333333333331, + Pair('Ridolfi', 'Medici'): 0.33333333333333331, + Pair('Peruzzi', 'Castellani'): 0.33333333333333331, + Pair('Tornabuoni', 'Strozzi'): 0.33333333333333331, + Pair('Tornabuoni', 'Bischeri'): 0.33333333333333331, + Pair('Barbadori', 'Albizzi'): 0.5, + Pair('Castellani', 'Bischeri'): 0.66666666666666663, + Pair('Ridolfi', 'Guadagni'): 0.33333333333333331, + Pair('Ridolfi', 'Bischeri'): 0.33333333333333331, + Pair('Ridolfi', 'Peruzzi'): 0.33333333333333331, + Pair('Medici', 'Castellani'): 0.33333333333333331, + Pair('Bischeri', 'Albizzi'): 0.33333333333333331, + Pair('Medici', 'Ginori'): 1.0, + Pair('Salviati', 'Ridolfi'): 0.5, + Pair('Tornabuoni', 'Barbadori'): 0.5, + Pair('Strozzi', 'Castellani'): 0.33333333333333331, + Pair('Salviati', 'Barbadori'): 0.5, + Pair('Strozzi', 'Peruzzi'): 0.66666666666666663, + Pair('Strozzi', 'Bischeri'): 0.33333333333333331, + Pair('Tornabuoni', 'Albizzi'): 0.66666666666666663, + Pair('Barbadori', 'Acciaiuoli'): 1.0, + Pair('Ridolfi', 'Castellani'): 0.33333333333333331, + Pair('Tornabuoni', 'Acciaiuoli'): 1.0} + assert_dict_equal(MinOverlap(self.G).predict(), answer) + + def test_min_overlap_weighted(self): + pass + + def test_pearson(self): + answer = { + Pair('Medici', 'Guadagni'): 0.091287092917527679, + Pair('Peruzzi', 'Bischeri'): 0.15151515151515152, + Pair('Lamberteschi', 'Bischeri'): 0.53108500454379437, + Pair('Salviati', 'Albizzi'): 0.28426762180748061, + Pair('Lamberteschi', 'Albizzi'): 0.53108500454379437, + Pair('Peruzzi', 'Guadagni'): 0.055048188256318034, + Pair('Strozzi', 'Barbadori'): 0.19364916731037085, + Pair('Pazzi', 'Medici'): 0.32025630761017432, + Pair('Ridolfi', 'Albizzi'): 0.15151515151515152, + Pair('Tornabuoni', 'Lamberteschi'): 0.53108500454379437, + Pair('Tornabuoni', 'Salviati'): 0.28426762180748061, + Pair('Ridolfi', 'Acciaiuoli'): 0.53108500454379437, + Pair('Barbadori', 'Acciaiuoli'): 0.67936622048675754, + Pair('Guadagni', 'Ginori'): 0.43852900965351466, + Pair('Peruzzi', 'Barbadori'): 0.28426762180748061, + Pair('Tornabuoni', 'Ridolfi'): 0.15151515151515152, + Pair('Albizzi', 'Acciaiuoli'): 0.53108500454379437, + Pair('Strozzi', 'Castellani'): 0.055048188256318034, + Pair('Ridolfi', 
'Barbadori'): 0.28426762180748061, + Pair('Peruzzi', 'Castellani'): 0.15151515151515152, + Pair('Tornabuoni', 'Strozzi'): 0.055048188256318034, + Pair('Tornabuoni', 'Bischeri'): 0.15151515151515152, + Pair('Barbadori', 'Albizzi'): 0.28426762180748061, + Pair('Castellani', 'Bischeri'): 0.5757575757575758, + Pair('Ridolfi', 'Guadagni'): 0.055048188256318034, + Pair('Ridolfi', 'Bischeri'): 0.15151515151515152, + Pair('Ridolfi', 'Peruzzi'): 0.15151515151515152, + Pair('Bischeri', 'Albizzi'): 0.15151515151515152, + Pair('Medici', 'Ginori'): 0.32025630761017432, + Pair('Salviati', 'Ridolfi'): 0.28426762180748061, + Pair('Tornabuoni', 'Barbadori'): 0.28426762180748061, + Pair('Salviati', 'Acciaiuoli'): 0.67936622048675754, + Pair('Salviati', 'Barbadori'): 0.41666666666666674, + Pair('Strozzi', 'Peruzzi'): 0.44038550605054427, + Pair('Strozzi', 'Bischeri'): 0.055048188256318034, + Pair('Tornabuoni', 'Albizzi'): 0.5757575757575758, + Pair('Ridolfi', 'Castellani'): 0.15151515151515152, + Pair('Tornabuoni', 'Acciaiuoli'): 0.53108500454379437} + assert_dict_equal(Pearson(self.G).predict(), answer) + + def test_pearson_weighted(self): + pass + + def test_resource_allocation(self): + answer = { + Pair('Ridolfi', 'Barbadori'): 0.16666666666666666, + Pair('Medici', 'Guadagni'): 0.66666666666666663, + Pair('Peruzzi', 'Bischeri'): 0.25, + Pair('Lamberteschi', 'Bischeri'): 0.25, + Pair('Salviati', 'Albizzi'): 0.16666666666666666, + Pair('Lamberteschi', 'Albizzi'): 0.25, + Pair('Peruzzi', 'Guadagni'): 0.33333333333333331, + Pair('Strozzi', 'Medici'): 0.33333333333333331, + Pair('Pazzi', 'Medici'): 0.5, + Pair('Ridolfi', 'Albizzi'): 0.16666666666666666, + Pair('Tornabuoni', 'Lamberteschi'): 0.25, + Pair('Tornabuoni', 'Salviati'): 0.16666666666666666, + Pair('Ridolfi', 'Acciaiuoli'): 0.16666666666666666, + Pair('Strozzi', 'Guadagni'): 0.33333333333333331, + Pair('Salviati', 'Acciaiuoli'): 0.16666666666666666, + Pair('Guadagni', 'Ginori'): 0.33333333333333331, + Pair('Strozzi', 'Barbadori'): 0.33333333333333331, + Pair('Peruzzi', 'Barbadori'): 0.33333333333333331, + Pair('Tornabuoni', 'Ridolfi'): 0.16666666666666666, + Pair('Albizzi', 'Acciaiuoli'): 0.16666666666666666, + Pair('Tornabuoni', 'Medici'): 0.33333333333333331, + Pair('Ridolfi', 'Medici'): 0.33333333333333331, + Pair('Peruzzi', 'Castellani'): 0.25, + Pair('Tornabuoni', 'Strozzi'): 0.33333333333333331, + Pair('Tornabuoni', 'Bischeri'): 0.25, + Pair('Barbadori', 'Albizzi'): 0.16666666666666666, + Pair('Castellani', 'Bischeri'): 0.58333333333333326, + Pair('Ridolfi', 'Guadagni'): 0.33333333333333331, + Pair('Ridolfi', 'Bischeri'): 0.25, + Pair('Ridolfi', 'Peruzzi'): 0.25, + Pair('Medici', 'Castellani'): 0.5, + Pair('Bischeri', 'Albizzi'): 0.25, + Pair('Medici', 'Ginori'): 0.33333333333333331, + Pair('Salviati', 'Ridolfi'): 0.16666666666666666, + Pair('Tornabuoni', 'Barbadori'): 0.16666666666666666, + Pair('Strozzi', 'Castellani'): 0.33333333333333331, + Pair('Salviati', 'Barbadori'): 0.16666666666666666, + Pair('Strozzi', 'Peruzzi'): 0.66666666666666663, + Pair('Strozzi', 'Bischeri'): 0.33333333333333331, + Pair('Tornabuoni', 'Albizzi'): 0.41666666666666663, + Pair('Barbadori', 'Acciaiuoli'): 0.16666666666666666, + Pair('Ridolfi', 'Castellani'): 0.25, + Pair('Tornabuoni', 'Acciaiuoli'): 0.16666666666666666} + assert_dict_equal(ResourceAllocation(self.G).predict(), answer) + + def test_resource_allocation_weighted(self): + pass + + +class TestTrivialNetwork: + def setup(self): + self.G = nx.Graph() + self.G.add_edges_from([(1, 2), (1, 3), (1, 4), (2, 5), 
(3, 5), (4, 5)]) + nx.set_node_attributes(self.G, 'eligible', dict.fromkeys(self.G, True)) + + def test_common_neighbours(self): + expected = {Pair(1, 5): 3, Pair(2, 3): 2, Pair(3, 4): 2, Pair(2, 4): 2} + assert_dict_equal(CommonNeighbours(self.G).predict(), expected) diff --git a/linkpred/predictors/tests/test_path.py b/linkpred/predictors/tests/test_path.py new file mode 100644 index 0000000..9fd3274 --- /dev/null +++ b/linkpred/predictors/tests/test_path.py @@ -0,0 +1,27 @@ +from nose.tools import * +import networkx as nx + +from linkpred.predictors.neighbour import * + +class TestPath: + + def test_graph_distance(self): + pass + + def test_weighted_graph_distance(self): + pass + + def test_weighted_graph_distance_alpha(self): + pass + + def test_katz(self): + pass + + def test_katz_beta(self): + pass + + def test_katz_weighted(self): + pass + + def test_katz_paths_only(self): + pass diff --git a/linkpred/predictors/util.py b/linkpred/predictors/util.py new file mode 100644 index 0000000..463dca2 --- /dev/null +++ b/linkpred/predictors/util.py @@ -0,0 +1,60 @@ +from ..network import neighbourhood_search + + +def neighbourhood(G, n, k=1): + """Get k-neighbourhood of node n""" + if k == 1: + return G[n] + dist = neighbourhood_search(G, n, k) + del dist[n] + return dist.keys() + + +def neighbourhood_intersection_size(G, a, b, weight=None, k=1): + """Get the summed weight of the common neighbours of a and b + + If weighted, we use the sum of the weight products. This is equivalent + to the vector-based interpretation (dot product of the two vectors). + + """ + common_neighbours = set(neighbourhood(G, a, k)) &\ + set(neighbourhood(G, b, k)) + if weight: + w = sum(G[a][n][weight] * G[b][n][weight] + for n in common_neighbours) + else: + w = len(common_neighbours) + return w + + +def neighbourhood_size(G, u, weight=None, k=1, pow=2): + """Get the weight of the neighbours of u + + If weighted, we use the sum of the squared edge weight for compatibility + with the vector-based measures. 
+ + """ + # The fast route for default options + if weight is None and k == 1: + return len(G[u]) + # The slow route for everything else + neighbours = neighbourhood(G, u, k) + if weight: + w = sum(G[u][v][weight] ** pow for v in neighbours) + else: + w = len(neighbours) + return w + + +def neighbourhood_union_size(G, a, b, weight=None, k=1, pow=2): + """Get the weight of the neighbours union of a and b""" + a_neighbours = set(neighbourhood(G, a, k)) + b_neighbours = set(neighbourhood(G, b, k)) + if weight: + w = sum(G[a][n][weight] ** pow for n in a_neighbours) +\ + sum(G[b][n][weight] ** pow for n in b_neighbours) -\ + sum(G[a][n][weight] * G[b][n][weight] + for n in a_neighbours & b_neighbours) + else: + w = len(a_neighbours | b_neighbours) + return w diff --git a/linkpred/result.py b/linkpred/result.py new file mode 100644 index 0000000..0ff8857 --- /dev/null +++ b/linkpred/result.py @@ -0,0 +1,168 @@ +import networkx as nx +from networkx.algorithms import bipartite + +from .util import log + +__all__ = ["ResultDict", "Result", "filter_low_degree_nodes"] + + +class Result(object): + """Result represents a query result as a pathspec, a network, or both.""" + + def __init__(self, data, eligible='eligible', + project=bipartite.weighted_projected_graph): + self.eligible = eligible + if isinstance(data, nx.Graph): + if nx.is_bipartite(data): + self.pathspec = data + if self.eligible is None: + bottom = [n for n, d in data.nodes(data=True)] + else: + bottom = [n for n, d in data.nodes( + data=True) if d[self.eligible]] + self.network = project(data, bottom) + else: + self.network = data + elif isinstance(data, Result): + self.pathspec = data.pathspec + self.network = data.network + else: + raise TypeError("Unexpected data type!") + + def __iter__(self): + return iter(self.network) + + def __len__(self): + return len(self.network) + + def for_comparison(self, exclude=set()): + """Return the result in a format, suitable for comparison. + + In practice this means we return it as a set of Pairs. + + """ + from .evaluation import Pair + + exclude = set(Pair(u, v) for u, v in exclude) + return set(Pair(u, v) for u, v in self.network.edges_iter()) - exclude + + def remove_items_from(self, l): + self.network.remove_nodes_from(l) + try: + self.pathspec.remove_nodes_from(l) + except AttributeError: + pass + + def add_remove_random_edges(self, pct_to_remove=None, pct_to_add=None): + from . 
import network + + if not pct_to_remove and not pct_to_add: + return + + # For simplicity, we do not do this for pathspecs + self.pathspec = None + + if pct_to_remove and pct_to_add: + network.add_remove_random_edges( + self.network, pct_to_add, pct_to_remove) + elif pct_to_remove: + network.remove_random_edges(self.network, pct_to_remove) + elif pct_to_add: + network.add_random_edges(self.network, pct_to_add) + + def low_degree(self, threshold): + """ + Find low-degree nodes + + Parameters + ---------- + threshold : int + Only nodes whose degree is below the threshold are retained + + """ + if self.eligible is not None: + return [n for n, d in self.network.degree_iter() + if d < threshold and self.network.node[n][self.eligible]] + else: + return [n for n, d in self.network.degree_iter() if d < threshold] + + def items_outside(self, container): + if self.eligible is not None: + return [n for n in self.network.nodes_iter() + if self.network.node[n][self.eligible] and n not in container] + else: + return [n for n in self.network.nodes_iter() if n not in container] + + +class ResultDict(dict): + """A dict of Results, along with some methods for manipulating them.""" + + def merge(self, mergespec, skipzero=True, weight='weight'): + """Merge Results according to mergespec into new Result with given name + + Parameters + ---------- + mergespec : a dict + dictionary of result names and their weight + (more weight = more importance) + + skipzero : True|False + If an entry in the mergespec has zero weight, skip it + (default: True) + + weight : string + Edge attribute for edge weight + + Returns + ------- + A networkx.Graph instance + + """ + log.logger.info("Merging...") + g = nx.Graph() + for resultname, resultweight in mergespec.iteritems(): + if resultweight == 0 and skipzero: + continue + result = self[resultname].network + # We also copy node data, so that 'eligible' keywords are retained + # in the merged network. + g.add_nodes_from(result.nodes(data=True)) + for u, v, edgedata in result.edges_iter(data=True): + w = edgedata[weight] * resultweight + if g.has_edge(u, v): + g.edge[u][v][weight] += w + else: + g.add_edge(u, v, attr_dict={weight: w}) + log.logger.info("Finished merging.") + return g + + def filter_all_low_degree_nodes(self, minimum=1): + networks = self.values() + filter_low_degree_nodes(networks, minimum) + + +def filter_low_degree_nodes(results, minimum=1): + """ + Only retain nodes that occur in all networks with at least a degree of k + + Changes are made in place. 
+ + Arguments + --------- + results : a list or iterable of Result instances + + minimum : int + minimum node degree + + """ + log.logger.info("Filtering low degree nodes...") + for res in results: + to_remove = res.low_degree(minimum) + res.remove_items_from(to_remove) + log.logger.info("Removed %d items" % len(to_remove)) + common = set.intersection(*[set(res) for res in results]) + for res in results: + to_remove = res.items_outside(common) + res.remove_items_from(to_remove) + log.logger.info("Removed %d items" % len(to_remove)) + log.logger.info("Finished filtering low degree nodes.") diff --git a/linkpred/tests/test_result.py b/linkpred/tests/test_result.py new file mode 100644 index 0000000..b10e80e --- /dev/null +++ b/linkpred/tests/test_result.py @@ -0,0 +1,42 @@ +from nose.tools import * + +import networkx as nx +from linkpred.result import * + +class TestResult: + def setup(self): + self.B = nx.bipartite_random_graph(50, 60, 0.2) + nodes = [n for n in self.B if self.B.node[n]['bipartite']] + self.G = nx.bipartite.weighted_projected_graph(self.B, nodes) + + def test_result_init(self): + res = Result(self.B, eligible='bipartite') + assert_equal(len(res), len(self.G)) + assert_equal(len(res.network), len(self.G)) + assert_equal(len(res.pathspec), len(self.B)) + + res = Result(self.G) + assert_equal(len(res), len(self.G)) + assert_equal(len(res.network), len(self.G)) + with assert_raises(AttributeError): + res.pathspec + + def test_result_remove_items(self): + res = Result(self.B, eligible='bipartite') + # the bottom nodes (bipartite=True) start from 50. + res.remove_items_from(range(50, 60)) + assert_equal(len(res), 50) + +def test_filter_low_degree_nodes(): + B1 = nx.bipartite_random_graph(50, 60, 0.2) + B2 = nx.bipartite_random_graph(50, 60, 0.2) + res1 = Result(B1, eligible='bipartite') + res2 = Result(B2, eligible='bipartite') + + filter_low_degree_nodes([res1, res2]) + assert_less_equal(len(res1), 60) + assert_less_equal(len(res2), 60) + assert_equal(len(res1), len([n for n in res1.pathspec\ + if res1.pathspec.node[n]['bipartite']])) + assert_equal(len(res2), len([n for n in res2.pathspec\ + if res2.pathspec.node[n]['bipartite']])) diff --git a/linkpred/util/__init__.py b/linkpred/util/__init__.py new file mode 100644 index 0000000..ed00929 --- /dev/null +++ b/linkpred/util/__init__.py @@ -0,0 +1,114 @@ +import re +import sys + + +def all_pairs(l): + """Return list of all possible pairs in l""" + try: + from itertools import combinations + return combinations(l, 2) + except ImportError: + return (tuple(sorted((x, y))) for i, x in enumerate(l, start=1) + for y in l[:i] if x != y) + + +def slugify(value): + """ + Normalize string to 'slug' + + Converts to lowercase, removes non-alpha characters, + and converts spaces to hyphens. + + Taken from http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python/295466#295466 + + """ + import unicodedata + value = unicodedata.normalize( + 'NFKD', unicode(value)).encode('ascii', 'ignore') + value = unicode(re.sub('[^\w\s-]', '', value).strip().lower()) + return unicode(re.sub('[-\s]+', '-', value)) + + +def progressbar(it, prefix="", size=60): + """Show progress bar + + Taken from http://code.activestate.com/recipes/576986-progress-bar-for-console-programs-as-iterator/ + + """ + count = len(it) + + def _show(_i): + x = int(size * _i / count) + sys.stdout.write( + "%s[%s%s] %i/%i\r" % (prefix, "#" * x, "." 
* (size - x), + _i, count)) + sys.stdout.flush() + + _show(0) + for i, item in enumerate(it, start=1): + yield item + _show(i) + sys.stdout.write("\n") + sys.stdout.flush() + + +def load_function(functionname): + """Return the function given by functionname + + This loads function names of the form 'module.function', + e.g. 'os.path.join'. + + """ + try: + # Find rightmost point. Everything to the left is module name. + index = functionname.rindex('.') + modulename = functionname[:index] + except ValueError: + raise Exception("No module name given in " + functionname) + # Dynamically load module and function + __import__(modulename) + module = sys.modules[modulename] + function = getattr(module, functionname[index + 1:]) + return function + + +def ensure_dir(fname): + """Make sure all the intermediate directories exist for given file name""" + import os + + d = os.path.dirname(fname) + if not os.path.isdir(d): + os.makedirs(d) + + +def interpolate(l): + """Make curve l non-increasing.""" + l.reverse() + for i in xrange(len(l) - 1): + if l[i] >= l[i + 1]: + l[i + 1] = l[i] + l.reverse() + return l + + +def itersubclasses(cls, _seen=None): + """Generator over all subclasses of a given class, in depth first order. + + Source: + http://code.activestate.com/recipes/576949-find-all-subclasses-of-a-given-class/ + + """ + if not isinstance(cls, type): + raise TypeError('itersubclasses must be called with ' + 'new-style classes, not %.100r' % cls) + if _seen is None: + _seen = set() + try: + subs = cls.__subclasses__() + except TypeError: # fails only when cls is type + subs = cls.__subclasses__(cls) + for sub in subs: + if sub not in _seen: + _seen.add(sub) + yield sub + for sub in itersubclasses(sub, _seen): + yield sub diff --git a/linkpred/util/log.py b/linkpred/util/log.py new file mode 100644 index 0000000..05fe33a --- /dev/null +++ b/linkpred/util/log.py @@ -0,0 +1,25 @@ +import logging +import sys + +logger = logging.getLogger('linkpred') +streamhandler = logging.StreamHandler(sys.stdout) +formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", + "%H:%M:%S") +streamhandler.setFormatter(formatter) +logger.setLevel(logging.INFO) +logger.addHandler(streamhandler) + + +def called_by(n=0): + """Returns caller of current function, useful for debugging + + Example + ------- + >>> def foo(): + .... from linkpred.util import log + .... log.logger.debug("Called by %s, %s, l. %s" % log.called_by(2)) + + """ + f = sys._getframe(n) + c = f.f_code + return c.co_filename, c.co_name, f.f_lineno diff --git a/scripts/linkpred b/scripts/linkpred new file mode 100644 index 0000000..58f93e9 --- /dev/null +++ b/scripts/linkpred @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +"""linkpred main script""" + +import os +import sys + +# LIBDIR trick start (marker for removal on platforms that don't need it) +# This trick is inspired by Nikola +# . +# It allows running from a direct checkout as well as a 'properly' installed +# package. +libdir = '@LIBDIR@' + +# Two cases: +if libdir != '@' 'LIBDIR' '@': + # Changed by our distutils hook; use the given path. + + if not os.path.isabs(libdir): + libdir = os.path.join(os.path.dirname( + os.path.realpath(__file__)), libdir) + libdir = os.path.abspath(libdir) +else: + # Unchanged, running from checkout, + # use the parent directory, the linkpred package ought to be there. 
+ libdir = os.path.join(os.path.dirname(__file__), "..") + +sys.path.insert(0, libdir) + +if "PYTHONPATH" not in os.environ: + os.environ["PYTHONPATH"] = libdir +else: + os.environ["PYTHONPATH"] = os.environ["PYTHONPATH"] + ":" + libdir + +# LIBDIR trick end (marker for removal on platforms that don't need it) + +from linkpred.cli import get_profile +from linkpred.core import training_test_data, predict, evaluate +from linkpred.evaluation import DataSet + + +def main(): + profile = get_profile(choose_interpolation=True, choose_filetype=True) + only_new = profile.get("only_new", False) + filetype = profile.get('filetype', 'pdf') + steps = profile.get('steps', 1) + interpolate = profile.get('interpolation', True) + label = profile['training']['name'] + + training, test = training_test_data(profile) + predictions = predict(training, profile, only_new, eligible="eligible") + exclude = set(training.network.edges_iter()) if only_new else set() + dataset = DataSet(label, predictions, test, exclude=exclude, steps=steps) + evaluate(dataset, label, filetype, interpolate, steps) + +if __name__ == "__main__": + main()
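The neighbourhood helpers added in linkpred/predictors/util.py treat weighted overlap as a dot product: with a weight key, the intersection size is the sum of weight products over common neighbours, and a node's own weighted size is the sum of its squared edge weights. A minimal sketch of that behaviour on a toy weighted graph, assuming the linkpred package from this patch is importable and the networkx 1.x API used throughout (the graph and weights are made up for illustration):

    import networkx as nx

    from linkpred.predictors.util import (neighbourhood_intersection_size,
                                          neighbourhood_size)

    # Toy weighted graph: 'a' and 'b' share the single neighbour 'x'.
    G = nx.Graph()
    G.add_weighted_edges_from([('a', 'x', 2.0), ('a', 'y', 1.0),
                               ('b', 'x', 3.0), ('b', 'z', 4.0)])

    print(neighbourhood_intersection_size(G, 'a', 'b'))                   # 1 common neighbour
    print(neighbourhood_intersection_size(G, 'a', 'b', weight='weight'))  # 2.0 * 3.0 = 6.0
    print(neighbourhood_size(G, 'a', weight='weight'))                    # 2.0**2 + 1.0**2 = 5.0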
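ResultDict.merge in linkpred/result.py combines several Result networks into one weighted graph: each result's edge weights are scaled by its mergespec weight and summed where edges coincide, and node data (including the 'eligible' flag) is copied into the merged network. A sketch of the intended use, with made-up graphs and weights, again assuming a Python 2 / networkx 1.x environment with this patch on the path; triangles are used so the input graphs are not treated as bipartite pathspecs:

    import networkx as nx

    from linkpred.result import Result, ResultDict

    g1 = nx.Graph()
    g1.add_weighted_edges_from([('a', 'b', 1.0), ('b', 'c', 1.0), ('a', 'c', 1.0)])
    g2 = nx.Graph()
    g2.add_weighted_edges_from([('a', 'b', 2.0), ('b', 'c', 1.0), ('a', 'c', 1.0)])
    for g in (g1, g2):
        # Mark all nodes as eligible, as the rest of the package expects.
        nx.set_node_attributes(g, 'eligible', dict.fromkeys(g, True))

    results = ResultDict(first=Result(g1), second=Result(g2))
    # 'second' counts twice as heavily as 'first'.
    merged = results.merge({'first': 1, 'second': 2})
    print(merged['a']['b']['weight'])   # 1.0 * 1 + 2.0 * 2 = 5.0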
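load_function in linkpred/util/__init__.py resolves a dotted name by importing everything left of the last dot as a module and looking up the remainder as an attribute. A small usage sketch (the dotted name is only an example):

    from linkpred.util import load_function

    join = load_function("os.path.join")
    print(join("results", "chart.pdf"))   # results/chart.pdf (separator is platform-dependent)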