Skip to content
This repository was archived by the owner on Oct 1, 2024. It is now read-only.

Commit 6956289

Browse files
committed
load a services.Dataset into a Neo4J instance
1 parent 2be9056 commit 6956289

File tree

6 files changed

+190
-1
lines changed

6 files changed

+190
-1
lines changed

.env.template

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ POSTGRES_DB=glossary
55
API_KEY="123456789"
66
DATABASE_URL="postgresql+psycopg://admin:123456789@postgres:5432/glossary"
77
SENTRY_DSN="https://…"
8+
NEO4J_AUTH=

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
start-neo4j:
2+
docker compose -f docker-compose.neo4j.yml --env-file .env up -d
3+
4+
stop-neo4j:
5+
docker compose -f docker-compose.neo4j.yml down
6+

dds_glossary/neo.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import logging
2+
from typing import Any, Dict, List, Tuple
3+
4+
from . import services
5+
6+
import neo4j
7+
from owlready2 import get_ontology
8+
9+
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
10+
11+
12+
def load_services_dataset(driver: neo4j.Driver, dataset: services.Dataset, dataset_path: str, import_chunk_size: int=1000):
13+
"""
14+
Load a given dataset into a Neo4J instance
15+
"""
16+
ontology = get_ontology(dataset.url).load()
17+
ontology.save(file=str(dataset_path), format="rdfxml")
18+
concept_schemes_raw, concepts_raw, _, semantic_relations_raw = services.GlossaryController.parse_dataset(dataset_path)
19+
20+
logging.info("Loaded the dataset in memory")
21+
22+
concept_schemes = [c.to_dict() for c in concept_schemes_raw]
23+
concepts = [c.to_dict() for c in concepts_raw]
24+
semantic_relations = [c.to_dict() for c in semantic_relations_raw]
25+
26+
# One service.Dataset has exactly one concept scheme
27+
main_concept_scheme = concept_schemes[0]
28+
29+
logging.info("Importing concept schemes...")
30+
31+
# Load concept schemes
32+
# for chunk in chunk_list(concept_schemes, import_chunk_size):
33+
# query, args = build_nodes_import_query_and_args(["ConceptScheme"], chunk)
34+
# driver.execute_query(query, args)
35+
36+
logging.info("Imported concept schemes")
37+
38+
logging.info("Importing concepts...")
39+
40+
# Load concepts
41+
# for chunk in chunk_list(concepts, import_chunk_size):
42+
# query, args = build_nodes_import_query_and_args(["Concept"], chunk)
43+
# driver.execute_query(query, args)
44+
45+
logging.info("Imported concept schemes")
46+
47+
logging.info("Adding indices...")
48+
49+
# Index by the IRI
50+
for key, label in [("ConceptScheme", "iri"), ("Concept", "iri")]:
51+
driver.execute_query(build_index_query(key=key, label=label))
52+
53+
logging.info("Added indices...")
54+
55+
logging.info("Importing concept -> concept scheme relationships...")
56+
57+
# Load concept -> concept scheme relationships
58+
for concept in concepts:
59+
edge_type = "IsFrom"
60+
edge = ("Concept", concept["iri"], "ConceptScheme", main_concept_scheme["iri"])
61+
query, args = build_edges_import_query_and_args([edge_type], [edge])
62+
driver.execute_query(query, args)
63+
64+
logging.info("Imported concept -> concept scheme relationships")
65+
66+
logging.info("Importing concept -> concept 'broader' relationships...")
67+
68+
# Load concept "broader" their relationships
69+
for semantic_relation in semantic_relations:
70+
edge_type = semantic_relations[0]["type"]
71+
72+
edge = ("Concept", semantic_relation["source_concept_iri"], "Concept", semantic_relation["target_concept_iri"])
73+
74+
query, args = build_edges_import_query_and_args([edge_type], [edge])
75+
driver.execute_query(query, args)
76+
77+
logging.info("Imported concept -> concept 'broader' relationships")
78+
79+
80+
def build_nodes_import_query_and_args(labels: List[str], nodes: List[Dict[str, Any]]):
81+
"""
82+
Bulk import nodes into Neo4J
83+
84+
## Example
85+
86+
> build_nodes_import_query_and_args(["Hello", "World"], [{"a": 1, "b": 2}, {"a": 1, "c": 10}])
87+
(
88+
"MERGE (e_0:Hello:World) {a: $a_0, b: $b_0}\nMERGE (e_1:Hello:World) {a: $a_1, c: $c_1}",
89+
{'a_0': 1, 'b_0': 2, 'a_1': 1, 'c_1': 10}
90+
)
91+
"""
92+
query_args = {}
93+
for idx, node in enumerate(nodes):
94+
for k, v in node.items():
95+
query_args[f"{k}_{idx}"] = v
96+
97+
schema_keys = set()
98+
for node in nodes:
99+
for k in node.keys():
100+
schema_keys.add(k)
101+
102+
node_labels_str = ':'.join(labels)
103+
104+
query_rows = []
105+
for idx, node in enumerate(nodes):
106+
schema_kv = [f"{k}: ${k}_{idx}" for k in node.keys()]
107+
query_row = f"MERGE (e_{idx}:{node_labels_str} {{{', '.join(schema_kv)}}})"
108+
query_rows.append(query_row)
109+
110+
query = "\n".join(query_rows)
111+
return query, query_args
112+
113+
114+
def build_edges_import_query_and_args(labels: List[str], edges: List[Tuple[str, str, str, str]]):
115+
"""
116+
Bulk import nodes into Neo4J
117+
118+
## Example
119+
120+
> build_edges_import_query_and_args(["IsFrom"], [("Concept", "def", "ConceptScheme", "abcd")])
121+
(
122+
"MATCH (src_0:Concept {iri: $iri_src_0}), (tgt_0: ConceptScheme {iri: $iri_tgt_0})\nWITH src_0, tgt_0\nMERGE (src_0)-[r_0:IsFrom]->(tgt_0)",
123+
{'iri_src_0': 'def', 'iri_tgt_0': 'abcd'}
124+
)
125+
"""
126+
query_args = {}
127+
for idx, edge in enumerate(edges):
128+
_, source_iri, _, target_iri = edge
129+
query_args[f"iri_src_{idx}"] = source_iri
130+
query_args[f"iri_tgt_{idx}"] = target_iri
131+
132+
edge_labels_str = ":".join(labels)
133+
134+
matches = []
135+
withs = []
136+
merges = []
137+
for idx, edge in enumerate(edges):
138+
source_label, source_iri, target_label, target_iri = edge
139+
matches.extend([
140+
f"(src_{idx}:{source_label} {{iri: $iri_src_{idx}}})",
141+
f"(tgt_{idx}:{target_label} {{iri: $iri_tgt_{idx}}})"
142+
])
143+
withs.extend([f"src_{idx}", f"tgt_{idx}"])
144+
merges.extend([f"(src_{idx})-[r_{idx}:{edge_labels_str}]->(tgt_{idx})"])
145+
146+
query = f"""
147+
MATCH {', '.join(matches)}
148+
"""
149+
for merge in merges:
150+
query += f"MERGE {merge}"
151+
152+
return query, query_args
153+
154+
155+
def build_index_query(label: str, key: str):
156+
"""
157+
Build indices for a list of keys on labels
158+
"""
159+
return f"CREATE INDEX {label}_{key}_index IF NOT EXISTS FOR (c:{label}) ON (c.{key})"
160+
161+
162+
def chunk_list(lst: list, n: int):
163+
"""
164+
Yield successive n-sized chunks from list `lst`.
165+
"""
166+
for i in range(0, len(lst), n):
167+
yield lst[i:i + n]

dds_glossary/services.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ def get_scheme_members(
118118
"""
119119
return [member for member in members if member.member_type == member_type]
120120

121+
@staticmethod
121122
def parse_dataset(
122-
self,
123123
dataset_path: Path,
124124
) -> tuple[
125125
list[ConceptScheme],

docker-compose.neo4j.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
services:
2+
neo4j:
3+
image: "neo4j:5.20.0-community-bullseye"
4+
volumes:
5+
- neo4j_data:/data
6+
- neo4j_logs:/logs
7+
ports:
8+
- 7474:7474
9+
- 7687:7687
10+
environment:
11+
- NEO4J_AUTH=${NEO4J_AUTH}
12+
volumes:
13+
neo4j_data:
14+
neo4j_logs:

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ dependencies = [
3939
"SQLAlchemy>=2.0.0",
4040
"sqlalchemy-utils",
4141
"uvicorn[standard]",
42+
"neo4j"
4243
]
4344

4445
[project.urls]

0 commit comments

Comments
 (0)