Skip to content

RGDA1 for RDFlib #441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Dec 11, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
2e115b8
First pass at a traces implementation, not full pruning yet
Nov 16, 2014
dad0677
Final bugfixes and a round of performance improvements
Nov 16, 2014
cdc3d6a
naive pruning of automorphisms
Nov 16, 2014
368f352
added benchmark and related instrumentation
Nov 18, 2014
d911bc4
further benchmark refinements
Nov 20, 2014
0419fa4
added multithreading to the benchmark.
Nov 21, 2014
a69c22a
switched to multiprocessing
Nov 21, 2014
86e324e
serial loading of ontologies.
Nov 21, 2014
ceae542
throttling the bioportal downloads to a max of 4 connections.
Nov 21, 2014
0f8adec
Forgot to actually put the finished tasks back out of the queue.
Nov 21, 2014
c3ede71
Forgot to actually put the finished tasks back out of the queue.
Nov 21, 2014
ea8ea14
more automorphism detection
Nov 30, 2014
110a0d4
More tests working, but it looks like JSON-LD doesn't do safe roundtr…
Dec 7, 2014
8be0ad2
Support for python 2.6 and hopefully 3.x
Dec 7, 2014
3ad72fa
Another unicode tweak, works locally on python 2.7.7 (anaconda)
Dec 7, 2014
52121a5
.n3() always returns unicode (right?) and the only other possible thi…
Dec 7, 2014
b29e0b0
Updated comments and citations, removed badly performing equality mea…
Dec 8, 2014
8e78708
minor: code style guides
joernhees Dec 8, 2014
4be615c
Privatized some decorators and utility functions.
Dec 8, 2014
a4f2999
Merge remote-tracking branch 'joernhees/canonicalization' into canoni…
Dec 9, 2014
200c226
Fixed up some of the style adjustments to pass tests.
Dec 9, 2014
26c2755
misnamed performance decorator
Dec 9, 2014
bf20429
removed unintended singleton
Dec 10, 2014
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
423 changes: 423 additions & 0 deletions benchmarks/analysis.ipynb

Large diffs are not rendered by default.

162 changes: 162 additions & 0 deletions examples/graph_digest_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#!/usr/bin/env python

'''
This benchmark will produce graph digests for all of the
downloadable ontologies available in Bioportal.
'''

import csv
import sys

from collections import defaultdict
from io import StringIO
from multiprocessing import *
from urllib import *

try:  # Python 3
    from queue import Empty
    from urllib.request import urlopen
except ImportError:  # Python 2
    from Queue import Empty
    from urllib2 import urlopen

from rdflib import *
from rdflib.compare import to_isomorphic

bioportal_query = '''
PREFIX metadata: <http://data.bioontology.org/metadata/>

select distinct ?ontology ?title ?download where {
?ontology a metadata:Ontology;
metadata:omvname ?title;
metadata:links ?links.
?links metadata:Ontology ?download.
filter(regex(?download, "/download"))
}
'''

stat_cols = [
'id',
'ontology',
'download_url',
'tree_depth',
'color_count',
'individuations',
'prunings',
'initial_color_count',
'adjacent_nodes',
'initial_coloring_runtime',
'triple_count',
'graph_digest',
'to_hash_runtime',
'canonicalize_triples_runtime',
'error',
]


def files_benchmark(ontologies, output_file, threads):
    '''Digest each ontology in ``ontologies`` using a pool of worker
    processes and stream per-ontology statistics to ``output_file`` as CSV.

    :param ontologies: iterable of file paths / URLs loadable by rdflib.
    :param output_file: path of the CSV report (one row per ontology).
    :param threads: number of worker *processes* (int or numeric string).
    '''

    def worker(q, finished_tasks, dl_lock):
        # Runs in a child process: pull a task, digest it, report back.
        # NOTE(review): q.get() blocks with no timeout, so Empty is never
        # actually raised here; the daemon flag below is what reaps these
        # workers when the parent exits.
        try:
            while True:
                stats = q.get()
                og = Graph()
                try:
                    og.load(stats['download_url'])
                    print(stats['ontology'], stats['id'])
                    ig = to_isomorphic(og)
                    # graph_digest() is expected to fill `stats` in place.
                    ig.graph_digest(stats)
                    finished_tasks.put(stats)
                except Exception as e:
                    # Record the failure instead of dropping the row: the
                    # parent counts rows to know when all tasks are done.
                    print('ERROR', stats['id'], e)
                    stats['error'] = str(e)
                    finished_tasks.put(stats)
        except Empty:
            pass

    tasks = Queue()
    finished_tasks = Queue()
    # Unused by this worker; kept for parity with bioportal_benchmark.
    dl_lock = Semaphore(4)
    task_count = len(ontologies)
    for i in range(int(threads)):
        print("Starting worker", i)
        t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
        t.daemon = True  # don't block interpreter exit on stuck downloads
        t.start()
    for download in ontologies:
        stats = defaultdict(str)
        stats.update({
            "id": download.split("/")[-1].split(".")[0],
            "ontology": download.split("/")[-1].split(".")[0],
            "download_url": download,
        })
        tasks.put(stats)
    tasks.close()
    # Stream rows as they complete so a crash loses as little as possible;
    # the context manager guarantees the report file is closed (the
    # original leaked the handle).
    with open(output_file, 'w') as w:
        writer = csv.DictWriter(w, stat_cols)
        writer.writeheader()
        written_tasks = 0
        while written_tasks < task_count:
            stats = finished_tasks.get()
            writer.writerow(stats)
            w.flush()
            written_tasks += 1


def bioportal_benchmark(apikey, output_file, threads):
    '''Download every ontology listed by the Bioportal REST API, digest
    each one in a pool of worker processes, and stream per-ontology
    statistics to ``output_file`` as CSV.

    :param apikey: Bioportal REST API key.
    :param output_file: path of the CSV report (one row per ontology).
    :param threads: number of worker *processes* (int or numeric string).
    '''
    url = 'http://data.bioontology.org/ontologies?apikey=%s' % apikey
    ontology_graph = Graph()
    print(url)
    # The listing endpoint returns JSON-LD bytes; decode explicitly
    # instead of relying on the Python-2-only unicode() builtin.
    ontology_list_json = urlopen(url).read()
    ontology_graph.parse(
        StringIO(ontology_list_json.decode('utf-8')), format="json-ld")
    ontologies = ontology_graph.query(bioportal_query)
    tasks = Queue()
    finished_tasks = Queue()
    # Throttle to at most 4 concurrent downloads from Bioportal.
    dl_lock = Semaphore(4)
    task_count = len(ontologies)

    def worker(q, finished_tasks, dl_lock):
        # Runs in a child process: pull a task, digest it, report back.
        # NOTE(review): q.get() blocks with no timeout, so Empty is never
        # actually raised here; the daemon flag below is what reaps these
        # workers when the parent exits.
        try:
            while True:
                stats = q.get()
                og = Graph()
                try:
                    try:
                        dl_lock.acquire()
                        og.load(stats['download_url'] + "?apikey=%s" % apikey)
                    finally:
                        dl_lock.release()
                    print(stats['ontology'], stats['id'])
                    ig = to_isomorphic(og)
                    # graph_digest() is expected to fill `stats` in place.
                    ig.graph_digest(stats)
                    finished_tasks.put(stats)
                except Exception as e:
                    # Record the failure instead of dropping the row: the
                    # parent counts rows to know when all tasks are done.
                    print('ERROR', stats['id'], e)
                    stats['error'] = str(e)
                    finished_tasks.put(stats)
        except Empty:
            pass

    for i in range(int(threads)):
        print("Starting worker", i)
        t = Process(target=worker, args=[tasks, finished_tasks, dl_lock])
        t.daemon = True  # don't block interpreter exit on stuck downloads
        t.start()
    for ontology, title, download in ontologies:
        stats = defaultdict(str)
        stats.update({
            "id": ontology,
            "ontology": title,
            "download_url": download,
        })
        tasks.put(stats)
    tasks.close()
    # Stream rows as they complete so a crash loses as little as possible;
    # the context manager guarantees the report file is closed (the
    # original leaked the handle).
    with open(output_file, 'w') as w:
        writer = csv.DictWriter(w, stat_cols)
        writer.writeheader()
        written_tasks = 0
        while written_tasks < task_count:
            stats = finished_tasks.get()
            writer.writerow(stats)
            w.flush()
            written_tasks += 1

if __name__ == '__main__':
    # With more than 3 CLI arguments, treat all but the last two as local
    # ontology files:
    #     graph_digest_benchmark.py FILE [FILE ...] OUTPUT_CSV THREADS
    # otherwise expect Bioportal credentials:
    #     graph_digest_benchmark.py APIKEY OUTPUT_CSV THREADS
    # The explicit usage message replaces the IndexError the original
    # raised when fewer than 3 arguments were supplied.
    if len(sys.argv) > 4:
        files_benchmark(sys.argv[1:-2], sys.argv[-2], sys.argv[-1])
    elif len(sys.argv) == 4:
        bioportal_benchmark(sys.argv[1], sys.argv[2], sys.argv[3])
    else:
        sys.exit('Usage: graph_digest_benchmark.py '
                 '(<apikey> | <file> [<file> ...]) <output.csv> <threads>')
Loading