Commit b513e5c

Merge pull request #12 from sage-org/develop
Develop
2 parents: bf9072d + 2aa33ae

80 files changed, +4288 -2380 lines changed

poetry.lock  -597 (this file was deleted)

pyproject.toml  +30 -16

@@ -1,9 +1,12 @@
 [tool.poetry]
 name = "sage-engine"
-version = "2.2.0"
-description = "Sage: a SPARQL query engine for public Linked Data providers"
+version = "2.3.0"
+description = "Sage: a preemptive SPARQL query engine for online Knowledge Graphs"
 repository = "https://github.com/sage-org/sage-engine"
-authors = [ "Thomas Minier <tminier01@gmail.com>" ]
+authors = [ "Thomas Minier <tminier01@gmail.com>",
+            "Julien Aimonier-Davat <julien.aimonier-davat@univ-nantes.fr>",
+            "Pascal Molli <pascal.molli@univ-nantes.fr>",
+            "Hala Skaf <hala.skaf@univ-nantes.fr>" ]
 keywords = [ "rdf", "sparql", "query engine" ]
 classifiers = [
     "Topic :: Database :: Database Engines/Servers",
@@ -20,35 +23,46 @@ exclude = [ "tests" ]
 
 [tool.poetry.scripts]
 sage = "sage.cli.http_server:start_sage_server"
+sage-query = "sage.cli.commons:sage_query"
+sage-debug = "sage.cli.debug:sage_query_debug"
 sage-grpc = "sage.cli.grpc_server:start_grpc_server"
 sage-postgres-init = "sage.cli.postgres:init_postgres"
 sage-postgres-index = "sage.cli.postgres:index_postgres"
 sage-postgres-put = "sage.cli.postgres:put_postgres"
+sage-sqlite-init = "sage.cli.sqlite:init_sqlite"
+sage-sqlite-index = "sage.cli.sqlite:index_sqlite"
+sage-sqlite-put = "sage.cli.sqlite:put_sqlite"
+sage-hbase-init = "sage.cli.hbase:init_hbase"
+sage-hbase-put = "sage.cli.hbase:put_hbase"
 
 [tool.poetry.dependencies]
 python = "^3.7"
-uvloop = "0.14.0"
-PyYAML = "5.1.2"
-rdflib = "4.2.2"
-rdflib-jsonld = "0.4.0"
-protobuf = "3.11.0"
-click = "7.0"
-fastapi = "0.44.1"
-uvicorn = "0.10.8"
-grpcio = "^1.26"
+uvloop = "0.15.2"
+PyYAML = "5.4.1"
+rdflib = "5.0.0"
+rdflib-jsonld = "0.5.0"
+protobuf = "3.15.7"
+click = "7.1.2"
+fastapi = "0.63.0"
+uvicorn = "0.13.4"
+grpcio = "^1.36"
+coloredlogs = "15.0"
+pylru = "^1.0"
 # optional dependencies
 pybind11 = { version = "2.2.4", optional = true }
 hdt = { version = "2.3", optional = true }
-psycopg2-binary = { version = "2.7.7", optional = true }
+psycopg2-binary = { version = "2.8.6", optional = true }
+happybase = { version = "1.2.0", optional = true }
 
 [tool.poetry.extras]
 hdt = ["pybind11", "hdt"]
 postgres = ["psycopg2-binary"]
+hbase = ["happybase"]
 
 [tool.poetry.dev-dependencies]
-pytest = "^5.3"
-pytest-asyncio = "^0.10.0"
-requests = "^2.22"
+pytest = "^6.2"
+pytest-asyncio = "^0.14"
+requests = "^2.25"
 sphinx = "^2.3"
 sphinx_rtd_theme = "^0.4.3"
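Note that the new sage-hbase-* commands rely on the optional happybase dependency, exposed through the new hbase extra above. Installing with that extra enabled (for example, poetry install -E hbase) should make sage-hbase-init and sage-hbase-put available on the command line, as defined in sage/cli/hbase.py below.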

sage/cli/commons.py  +5 -5

@@ -35,7 +35,7 @@ def sage_query(entrypoint, default_graph_uri, query, file, format, limit):
 
     # prepare query headers
     headers = {
-        "accept": "application/sparql-results+json",
+        "accept": "text/html",
         "content-type": "application/json",
         "next": None
     }
@@ -52,10 +52,10 @@ def sage_query(entrypoint, default_graph_uri, query, file, format, limit):
     while has_next and count < limit:
         response = requests.post(entrypoint, headers=headers, data=dumps(payload))
         json_response = response.json()
-        has_next = json_response["head"]['hasNext']
-        payload["next"] = json_response["head"]["next"]
-        for bindings in json_response["results"]['bindings']:
-            print(bindings)
+        has_next = json_response['next']
+        payload['next'] = json_response['next']
+        for bindings in json_response['bindings']:
+            print(str(bindings))
         count += 1
         if count >= limit:
             break
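The updated loop assumes the server's JSON response now carries the resume token and the solution mappings at the top level ('next' and 'bindings') instead of under SPARQL-results-style 'head' and 'results' keys. Below is a stand-alone sketch of that client-side pagination pattern; the endpoint, graph URI and query are placeholders, and the payload field names are guessed from the surrounding CLI code, which is not shown in this hunk:

import requests
from json import dumps

ENTRYPOINT = "http://localhost:8000/sparql"              # placeholder Sage endpoint
payload = {
    "query": "SELECT * WHERE { ?s ?p ?o } LIMIT 100",    # placeholder query
    "defaultGraph": "http://example.org/graph",          # placeholder graph URI
    "next": None                                         # resume token, None on the first call
}
headers = {"accept": "text/html", "content-type": "application/json", "next": None}

has_next = True
while has_next:
    response = requests.post(ENTRYPOINT, headers=headers, data=dumps(payload))
    page = response.json()
    for bindings in page["bindings"]:       # one page of solution mappings
        print(bindings)
    payload["next"] = page["next"]          # carry the token over to resume the query
    has_next = page["next"] is not None     # assumed: no token once the query completes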

sage/cli/debug.py  +102 (new file)

@@ -0,0 +1,102 @@
+# debug.py
+# Author: Thomas MINIER - MIT License 2017-2019
+# Author: Pascal Molli - MIT License 2017-2019
+
+from rdflib import BNode, Literal, URIRef, Variable
+from rdflib import Graph
+
+from starlette.testclient import TestClient
+from tests.http.utils import post_sparql
+
+from sage.http_server.server import run_app
+from sage.query_engine.optimizer.query_parser import parse_query
+from sage.database.core.yaml_config import load_config
+from sage.query_engine.sage_engine import SageEngine
+
+import click
+import requests
+from json import dumps
+from math import inf
+from sys import exit
+import json
+import asyncio
+import uvloop
+
+import coloredlogs
+import logging
+from time import time
+
+# install logger
+coloredlogs.install(level='INFO', fmt='%(asctime)s - %(levelname)s %(message)s')
+logger = logging.getLogger(__name__)
+
+
+async def execute(engine, iterator, limit):
+    """Pull solution mappings from the root iterator and print them as they arrive."""
+    count = 0
+    while iterator.has_next() and count < limit:
+        value = await iterator.next()
+        print(value)
+        count += 1
+
+
+@click.command()
+@click.argument("config_file")
+@click.argument("default_graph_uri")
+@click.option("-q", "--query", type=str, default=None, help="SPARQL query to execute (passed in command-line)")
+@click.option("-f", "--file", type=str, default=None, help="File containing a SPARQL query to execute")
+@click.option("-l", "--limit", type=int, default=None, help="Maximum number of solution bindings to fetch, similar to the SPARQL LIMIT modifier.")
+def sage_query_debug(config_file, default_graph_uri, query, file, limit):
+    """
+    Debug a SPARQL query on an embedded Sage Server.
+
+    Example usage: sage-debug config.yaml http://example.org/swdf-postgres -f queries/spo.sparql
+    """
+    # assert that we have a query to evaluate
+    if query is None and file is None:
+        print("Error: you must specify a query to execute, either with --query or --file. See sage-debug --help for more information.")
+        exit(1)
+
+    # run with DEBUG-level logging so the engine and asyncio trace the evaluation
+    logging.basicConfig(level=logging.DEBUG)
+
+    if limit is None:
+        limit = inf
+
+    # load query from file if required
+    if file is not None:
+        with open(file) as query_file:
+            query = query_file.read()
+
+    # load the dataset and the default RDF graph from the configuration file
+    dataset = load_config(config_file)
+    if dataset is None:
+        print(f"config file {config_file} not found")
+        exit(1)
+    graph = dataset.get_graph(default_graph_uri)
+    if graph is None:
+        print("RDF Graph not found: " + default_graph_uri)
+        exit(1)
+
+    # build the pipeline of iterators for the query, with a quantum and result
+    # limit large enough for the query to run to completion in a single quantum
+    engine = SageEngine()
+    context = dict()
+    context['quantum'] = 1000000
+    context['max_results'] = 1000000
+    context['start_timestamp'] = time()
+    iterator, cards = parse_query(query, dataset, default_graph_uri, context)
+
+    # drive the root iterator until exhaustion on the asyncio event loop
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(execute(engine, iterator, limit))
+    loop.close()
+
+
+if __name__ == '__main__':
+    sage_query_debug()
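Once installed, this command is exposed as the sage-debug console script declared in pyproject.toml above; it can also be driven from Python through click's testing utilities. A minimal sketch, where config.yaml, the graph URI and the query file are placeholders for a local Sage configuration:

from click.testing import CliRunner
from sage.cli.debug import sage_query_debug

runner = CliRunner()
# config.yaml, the graph URI and queries/spo.sparql are placeholders
result = runner.invoke(sage_query_debug, [
    "config.yaml",
    "http://example.org/swdf-postgres",
    "-f", "queries/spo.sparql",
    "-l", "10",
])
print(result.exit_code)
print(result.output)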

sage/cli/hbase.py  +128 (new file)

@@ -0,0 +1,128 @@
+# hbase.py
+# Author: Thomas MINIER - MIT License 2019
+import happybase
+import click
+import coloredlogs
+import logging
+
+from sys import exit
+from os.path import isfile
+from yaml import load
+from hdt import HDTDocument
+from time import time
+from sage.database.hbase.utils import build_row_key
+from sage.cli.utils import load_graph, get_nb_triples
+from sage.cli.parsers import ParserFactory, ParseError
+
+coloredlogs.install(level='INFO', fmt='%(asctime)s - %(levelname)s %(message)s')
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.argument("config")
+@click.argument("graph_name")
+def init_hbase(config, graph_name):
+    """
+    Initialize the RDF Graph GRAPH_NAME with an Apache HBase backend, described in the configuration file CONFIG.
+    """
+    # load graph from config file
+    graph, kind = load_graph(config, graph_name, logger, backends=['hbase'])
+
+    thrift_port = graph['thrift_port'] if 'thrift_port' in graph else 9090
+
+    logger.info("Connecting to the HBase server...")
+    connection = happybase.Connection(graph['thrift_host'], protocol="compact", transport="framed", port=thrift_port, table_prefix=graph_name)
+    logger.info("Connected to the HBase server!")
+
+    # create one HBase table per triple ordering (SPO, POS and OSP)
+    families = {'rdf': dict()}
+    logger.info("Creating HBase tables for RDF Graph '{}'...".format(graph_name))
+    connection.create_table('spo', families)
+    connection.create_table('pos', families)
+    connection.create_table('osp', families)
+    logger.info("RDF Graph '{}' successfully created in HBase".format(graph_name))
+    connection.close()
+
+
+@click.command()
+@click.argument("rdf_file")
+@click.argument("config")
+@click.argument("graph_name")
+@click.option("-f", "--format", type=click.Choice(["nt", "hdt"]),
+              default="nt", show_default=True,
+              help="Format of the input file. Supported: nt (N-triples) and hdt (HDT).")
+@click.option("-b", "--batch-size", type=int, default=1000, show_default=True,
+              help="Batch size used for batch loading")
+def put_hbase(rdf_file, config, graph_name, format, batch_size):
+    """
+    Insert RDF triples from the file RDF_FILE into the RDF Graph GRAPH_NAME, described in the configuration file CONFIG. The graph must use the Apache HBase backend.
+    """
+    # load graph from config file
+    graph, kind = load_graph(config, graph_name, logger, backends=['hbase'])
+
+    thrift_port = graph['thrift_port'] if 'thrift_port' in graph else 9090
+
+    logger.info("Connecting to the HBase server...")
+    connection = happybase.Connection(graph['thrift_host'], protocol="compact", transport="framed", port=thrift_port, table_prefix=graph_name)
+    logger.info("Connected to the HBase server!")
+
+    # one write batch per index table
+    spo_batch = connection.table('spo').batch(batch_size=batch_size)
+    pos_batch = connection.table('pos').batch(batch_size=batch_size)
+    osp_batch = connection.table('osp').batch(batch_size=batch_size)
+
+    logger.info("Reading RDF source file...")
+    nb_triples = get_nb_triples(rdf_file, format)
+    logger.info(f"Found ~{nb_triples} RDF triples to ingest.")
+
+    start = time()
+    inserted = 0
+    dropped = 0
+
+    with click.progressbar(length=nb_triples, label=f"Inserting RDF triples 0/{nb_triples} - {dropped} triples dropped.") as bar:
+
+        def on_bucket(bucket):
+            # store each triple under its three row keys (spo, pos and osp)
+            nonlocal inserted, dropped
+            for (s, p, o) in bucket:
+                columns = {
+                    b'rdf:subject': s.encode('utf-8'),
+                    b'rdf:predicate': p.encode('utf-8'),
+                    b'rdf:object': o.encode('utf-8')
+                }
+                spo_key = build_row_key(s, p, o)
+                pos_key = build_row_key(p, o, s)
+                osp_key = build_row_key(o, s, p)
+                spo_batch.put(spo_key, columns)
+                pos_batch.put(pos_key, columns)
+                osp_batch.put(osp_key, columns)
+            inserted = inserted + len(bucket)
+            bar.label = f"Inserting RDF triples {inserted}/{nb_triples} - {dropped} triples dropped."
+            bar.update(len(bucket))
+
+        def on_error(error):
+            # skip malformed triples, abort on any other error
+            nonlocal dropped, inserted
+            if isinstance(error, ParseError):
+                logger.warning(error)
+                dropped = dropped + 1
+                bar.label = f"Inserting RDF triples {inserted}/{nb_triples} - {dropped} triples dropped."
+                bar.update(0)
+            else:
+                logger.error(error)
+                exit(1)
+
+        def on_complete():
+            nonlocal start
+            # send the last, possibly incomplete, batches
+            spo_batch.send()
+            pos_batch.send()
+            osp_batch.send()
+            logger.info(f"RDF triples ingestion successfully completed in {time() - start}s")
+            logger.info("Committing and cleaning up...")
+            connection.close()
+            logger.info(f"RDF data from file '{rdf_file}' successfully inserted into RDF graph '{graph_name}'")
+
+        logger.info("Starting RDF triples ingestion...")
+        parser = ParserFactory.create_parser(format, batch_size)
+        parser.on_bucket = on_bucket
+        parser.on_error = on_error
+        parser.on_complete = on_complete
+        parser.parsefile(rdf_file)
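Storing each triple three times under SPO, POS and OSP row keys lets any triple pattern be answered by a prefix scan over the table whose key ordering starts with the pattern's bound components. A minimal read-side sketch of that idea follows; the key layout is an assumption (build_row_key is not shown in this commit, so a plain joined-string prefix stands in for it), and the host, port and graph name are placeholders:

import happybase

# placeholders: adjust to your HBase deployment and graph name
connection = happybase.Connection('localhost', protocol="compact", transport="framed",
                                  port=9090, table_prefix='mygraph')

def scan_prefix(table_name, *bound_terms):
    """Prefix-scan one index table on its leading bound components and yield full triples."""
    # assumption: row keys are the triple components joined in table order, so a prefix
    # built from the leading bound components matches every stored key of the pattern
    prefix = '\x00'.join(bound_terms).encode('utf-8')
    for _, columns in connection.table(table_name).scan(row_prefix=prefix):
        yield (columns[b'rdf:subject'].decode('utf-8'),
               columns[b'rdf:predicate'].decode('utf-8'),
               columns[b'rdf:object'].decode('utf-8'))

# (?s foaf:knows ?o): the predicate is the only bound term, so scan the 'pos' table
for s, p, o in scan_prefix('pos', 'http://xmlns.com/foaf/0.1/knows'):
    print(s, p, o)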
