diff --git a/llm/ms_graphrag_import.ipynb b/llm/ms_graphrag_import.ipynb
new file mode 100644
index 0000000..f5334a5
--- /dev/null
+++ b/llm/ms_graphrag_import.ipynb
@@ -0,0 +1,1030 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "3eeee95f-e4f2-4052-94fb-a5dc8ab542ae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n",
+ " from pandas.core import (\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from neo4j import GraphDatabase\n",
+ "import time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "b6c15443-4acb-4f91-88ea-4e08abaa4c29",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "NEO4J_URI=\"bolt://localhost\"\n",
+ "NEO4J_USERNAME=\"neo4j\"\n",
+ "NEO4J_PASSWORD=\"password\"\n",
+ "\n",
+ "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "d787bf7b-ac9b-4bfb-b140-a50a3fd205c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def batched_import(statement, df, batch_size=1000):\n",
+ " total = len(df)\n",
+ " start_s = time.time()\n",
+ " for start in range(0,total, batch_size):\n",
+ " batch = df.iloc[start: min(start+batch_size,total)]\n",
+ " result = driver.execute_query(\"UNWIND $rows AS value \" + statement, \n",
+ " rows=batch.to_dict('records'))\n",
+ " print(result.summary.counters)\n",
+ " print(f'{total} rows in { time.time() - start_s} s.') \n",
+ " return total"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "ed7f212e-9148-424c-adc6-d81db9f8e5a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique\n",
+ "\n",
+ "create constraint document_id if not exists for (d:__Document__) require d.id is unique\n",
+ "\n",
+ "create constraint entity_id if not exists for (c:__Community__) require c.community is unique\n",
+ "\n",
+ "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique\n",
+ "\n",
+ "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique\n",
+ "\n",
+ "create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique\n",
+ "\n",
+ "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create uniqueness constraints.\n",
+ "# Every constraint needs its own name: with IF NOT EXISTS, a statement that\n",
+ "# reuses the name of an existing constraint is silently skipped.\n",
+ "\n",
+ "statements = \"\"\"\n",
+ "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;\n",
+ "create constraint document_id if not exists for (d:__Document__) require d.id is unique;\n",
+ "create constraint community_id if not exists for (c:__Community__) require c.community is unique;\n",
+ "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;\n",
+ "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;\n",
+ "create constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique;\n",
+ "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;\n",
+ "\"\"\".split(\";\")\n",
+ "\n",
+ "for s in statements:\n",
+ " if len((s or \"\").strip()) > 0:\n",
+ " print(s)\n",
+ " driver.execute_query(query_=s)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "5ec93c92-499d-4ec6-bf3b-c34f74552600",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " text | \n",
+ " n_tokens | \n",
+ " document_ids | \n",
+ " entity_ids | \n",
+ " relationship_ids | \n",
+ " covariate_ids | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2cf7a230c367a2dfaf0fc3c903eb8948 | \n",
+ " # Operation: Dulce\\n\\n## Chapter 1\\n\\nThe thru... | \n",
+ " 2500 | \n",
+ " [958fdd043f17ade63cb13570b59df295] | \n",
+ " [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... | \n",
+ " [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870... | \n",
+ " [ad5a2020-cdec-4982-acdf-dbe5ee530066, 9d8a0fe... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6d1255303acb7c9dc951cb0f5fc3042c | \n",
+ " be the same.\\n\\n\\*\\n\\nThe sense of foreboding... | \n",
+ " 2500 | \n",
+ " [958fdd043f17ade63cb13570b59df295] | \n",
+ " [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... | \n",
+ " [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870... | \n",
+ " [5d1c9126-c48d-4755-9f9c-f739c823f95f, ec64a42... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id \\\n",
+ "0 2cf7a230c367a2dfaf0fc3c903eb8948 \n",
+ "1 6d1255303acb7c9dc951cb0f5fc3042c \n",
+ "\n",
+ " text n_tokens \\\n",
+ "0 # Operation: Dulce\\n\\n## Chapter 1\\n\\nThe thru... 2500 \n",
+ "1 be the same.\\n\\n\\*\\n\\nThe sense of foreboding... 2500 \n",
+ "\n",
+ " document_ids \\\n",
+ "0 [958fdd043f17ade63cb13570b59df295] \n",
+ "1 [958fdd043f17ade63cb13570b59df295] \n",
+ "\n",
+ " entity_ids \\\n",
+ "0 [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... \n",
+ "1 [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... \n",
+ "\n",
+ " relationship_ids \\\n",
+ "0 [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870... \n",
+ "1 [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870... \n",
+ "\n",
+ " covariate_ids \n",
+ "0 [ad5a2020-cdec-4982-acdf-dbe5ee530066, 9d8a0fe... \n",
+ "1 [5d1c9126-c48d-4755-9f9c-f739c823f95f, ec64a42... "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "text_df = pd.read_parquet('create_final_text_units.parquet')\n",
+ "text_df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "ffd3d380-8710-46f5-b90a-04ed8482192c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'_contains_updates': True, 'labels_added': 13, 'relationships_created': 12, 'nodes_created': 13, 'properties_set': 37}\n",
+ "12 rows in 0.08599472045898438 s.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "12"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "statement = \"\"\"\n",
+ "MERGE (n:__Chunk__ {id:value.id})\n",
+ "SET n += value {.text, .n_tokens}\n",
+ "WITH n, value\n",
+ "UNWIND value.document_ids AS document\n",
+ "MERGE (d:__Document__ {id:document})\n",
+ "MERGE (n)-[:PART_OF_DOCUMENT]->(d)\n",
+ "\"\"\"\n",
+ "batched_import(statement, text_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "140b420e-045e-4c71-9f25-1a20c5b528bd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " type | \n",
+ " description | \n",
+ " human_readable_id | \n",
+ " graph_embedding | \n",
+ " text_unit_ids | \n",
+ " description_embedding | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " b45241d70f0e43fca764df95b2b81f77 | \n",
+ " ALEX MERCER | \n",
+ " PERSON | \n",
+ " Alex Mercer is a character with a military bac... | \n",
+ " 0 | \n",
+ " None | \n",
+ " [00fafabae48948779fee2afe600f5143, 1e433d6b308... | \n",
+ " [0.009358493611216545, -0.02407047711312771, -... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4119fd06010c494caa07f439b333f4c5 | \n",
+ " TAYLOR CRUZ | \n",
+ " PERSON | \n",
+ " Taylor Cruz is a character who plays a pivotal... | \n",
+ " 1 | \n",
+ " None | \n",
+ " [00fafabae48948779fee2afe600f5143, 1e433d6b308... | \n",
+ " [0.0020127426832914352, -0.027186712250113487,... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name type \\\n",
+ "0 b45241d70f0e43fca764df95b2b81f77 ALEX MERCER PERSON \n",
+ "1 4119fd06010c494caa07f439b333f4c5 TAYLOR CRUZ PERSON \n",
+ "\n",
+ " description human_readable_id \\\n",
+ "0 Alex Mercer is a character with a military bac... 0 \n",
+ "1 Taylor Cruz is a character who plays a pivotal... 1 \n",
+ "\n",
+ " graph_embedding text_unit_ids \\\n",
+ "0 None [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n",
+ "1 None [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n",
+ "\n",
+ " description_embedding \n",
+ "0 [0.009358493611216545, -0.02407047711312771, -... \n",
+ "1 [0.0020127426832914352, -0.027186712250113487,... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "entity_df = pd.read_parquet('create_final_entities.parquet')\n",
+ "entity_df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "1d038114-0714-48ee-a48a-c421cd539661",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'_contains_updates': True, 'labels_added': 217, 'relationships_created': 307, 'nodes_created': 217, 'properties_set': 1085}\n",
+ "217 rows in 0.37180399894714355 s.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "217"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "entity_statement = \"\"\"\n",
+ "MERGE (n:__Entity__ {id:value.id})\n",
+ "SET n += value {.human_readable_id, .description, name:replace(value.name,'\"',''), .description_embedding}\n",
+ "WITH n, value\n",
+ "CALL apoc.create.addLabels(n, case when value.type is null OR value.type = \"\" then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n",
+ "UNWIND value.text_unit_ids AS text_unit\n",
+ "MERGE (c:__Chunk__ {id:text_unit})\n",
+ "MERGE (c)-[:MENTIONS]->(n)\n",
+ "RETURN count(*)\n",
+ "\"\"\"\n",
+ "batched_import(entity_statement, entity_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "5e713603-c508-4964-ba49-474e4867b747",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " source | \n",
+ " target | \n",
+ " weight | \n",
+ " description | \n",
+ " text_unit_ids | \n",
+ " id | \n",
+ " human_readable_id | \n",
+ " source_degree | \n",
+ " target_degree | \n",
+ " rank | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ALEX MERCER | \n",
+ " TAYLOR CRUZ | \n",
+ " 7.0 | \n",
+ " Alex Mercer and Taylor Cruz are integral membe... | \n",
+ " [00fafabae48948779fee2afe600f5143, 1e433d6b308... | \n",
+ " b35c3d1a7daa4924b6bdb58bc69c354d | \n",
+ " 0 | \n",
+ " 9 | \n",
+ " 12 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ALEX MERCER | \n",
+ " TAYLOR CRUZ | \n",
+ " 7.0 | \n",
+ " Alex Mercer and Taylor Cruz are integral membe... | \n",
+ " [00fafabae48948779fee2afe600f5143, 1e433d6b308... | \n",
+ " b35c3d1a7daa4924b6bdb58bc69c354d | \n",
+ " 0 | \n",
+ " 9 | \n",
+ " 12 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source target weight \\\n",
+ "0 ALEX MERCER TAYLOR CRUZ 7.0 \n",
+ "1 ALEX MERCER TAYLOR CRUZ 7.0 \n",
+ "\n",
+ " description \\\n",
+ "0 Alex Mercer and Taylor Cruz are integral membe... \n",
+ "1 Alex Mercer and Taylor Cruz are integral membe... \n",
+ "\n",
+ " text_unit_ids \\\n",
+ "0 [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n",
+ "1 [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n",
+ "\n",
+ " id human_readable_id source_degree \\\n",
+ "0 b35c3d1a7daa4924b6bdb58bc69c354d 0 9 \n",
+ "1 b35c3d1a7daa4924b6bdb58bc69c354d 0 9 \n",
+ "\n",
+ " target_degree rank \n",
+ "0 12 21 \n",
+ "1 12 21 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rel_df = pd.read_parquet('create_final_relationships.parquet')\n",
+ "rel_df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "27900c01-89e1-4dec-9d5c-c07317c68baf",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'_contains_updates': True, 'relationships_created': 69, 'properties_set': 1449}\n",
+ "276 rows in 0.1078798770904541 s.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "276"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rel_statement = \"\"\"\n",
+ " MATCH (source:__Entity__ {name:replace(value.source,'\"','')})\n",
+ " MATCH (target:__Entity__ {name:replace(value.target,'\"','')})\n",
+ " // merging on id is not strictly required since there is only one relationship per pair, but it matches the related_id uniqueness constraint\n",
+ " MERGE (source)-[rel:RELATED {id: value.id}]->(target)\n",
+ " SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}\n",
+ " RETURN count(*) as createdRels\n",
+ "\"\"\"\n",
+ "batched_import(rel_statement, rel_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "1be9e7a9-69ee-406b-bce5-95a9c41ecffe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " community | \n",
+ " full_content | \n",
+ " level | \n",
+ " rank | \n",
+ " title | \n",
+ " rank_explanation | \n",
+ " summary | \n",
+ " findings | \n",
+ " full_content_json | \n",
+ " id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4 | \n",
+ " # Dulce Base and the Paranormal Military Squad... | \n",
+ " 1 | \n",
+ " 8.5 | \n",
+ " Dulce Base and the Paranormal Military Squad: ... | \n",
+ " The impact severity rating is high due to the ... | \n",
+ " The community is centered around Dulce Base, a... | \n",
+ " [{'explanation': 'Dulce Base is the primary lo... | \n",
+ " {\\n \"title\": \"Dulce Base and the Paranormal... | \n",
+ " 6f8ba6b6-506e-46c1-83ce-982d59622554 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 5 | \n",
+ " # Sam Rivera and the Paranormal Military Squad... | \n",
+ " 1 | \n",
+ " 7.5 | \n",
+ " Sam Rivera and the Paranormal Military Squad a... | \n",
+ " The impact severity rating is high due to the ... | \n",
+ " The community is centered around Sam Rivera, a... | \n",
+ " [{'explanation': 'Sam Rivera is recognized for... | \n",
+ " {\\n \"title\": \"Sam Rivera and the Paranormal... | \n",
+ " 418f4536-d673-4212-8a7c-ca1aac547d0f | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " community full_content level rank \\\n",
+ "0 4 # Dulce Base and the Paranormal Military Squad... 1 8.5 \n",
+ "1 5 # Sam Rivera and the Paranormal Military Squad... 1 7.5 \n",
+ "\n",
+ " title \\\n",
+ "0 Dulce Base and the Paranormal Military Squad: ... \n",
+ "1 Sam Rivera and the Paranormal Military Squad a... \n",
+ "\n",
+ " rank_explanation \\\n",
+ "0 The impact severity rating is high due to the ... \n",
+ "1 The impact severity rating is high due to the ... \n",
+ "\n",
+ " summary \\\n",
+ "0 The community is centered around Dulce Base, a... \n",
+ "1 The community is centered around Sam Rivera, a... \n",
+ "\n",
+ " findings \\\n",
+ "0 [{'explanation': 'Dulce Base is the primary lo... \n",
+ "1 [{'explanation': 'Sam Rivera is recognized for... \n",
+ "\n",
+ " full_content_json \\\n",
+ "0 {\\n \"title\": \"Dulce Base and the Paranormal... \n",
+ "1 {\\n \"title\": \"Sam Rivera and the Paranormal... \n",
+ "\n",
+ " id \n",
+ "0 6f8ba6b6-506e-46c1-83ce-982d59622554 \n",
+ "1 418f4536-d673-4212-8a7c-ca1aac547d0f "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "community_df = pd.read_parquet('create_final_community_reports.parquet')\n",
+ "community_df.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "5c6ed591-f98c-4403-9fde-8d4cb4c01cca",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'_contains_updates': True, 'labels_added': 37, 'relationships_created': 31, 'nodes_created': 37, 'properties_set': 110}\n",
+ "6 rows in 0.05302619934082031 s.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "6"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# import communities\n",
+ "# Run only once / not idempotent\n",
+ "community_statement = \"\"\"\n",
+ "MERGE (c:__Community__ {id:value.id})\n",
+ "SET c += value {.community, .level, .title, .rank, .rank_explanation, .full_content, .summary}\n",
+ "WITH c, value\n",
+ "UNWIND value.findings AS finding\n",
+ "CREATE (c)-[:HAS_FINDING]->(f:Finding)\n",
+ "SET f += finding\n",
+ "\"\"\"\n",
+ "batched_import(community_statement, community_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "523bed92-d12c-4fc4-aa44-6c62321b36bc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " human_readable_id | \n",
+ " covariate_type | \n",
+ " type | \n",
+ " description | \n",
+ " subject_id | \n",
+ " subject_type | \n",
+ " object_id | \n",
+ " object_type | \n",
+ " status | \n",
+ " start_date | \n",
+ " end_date | \n",
+ " source_text | \n",
+ " text_unit_id | \n",
+ " document_ids | \n",
+ " n_tokens | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ad5a2020-cdec-4982-acdf-dbe5ee530066 | \n",
+ " 1 | \n",
+ " claim | \n",
+ " MISSION INVOLVEMENT | \n",
+ " Agent Alex Mercer's compliance in the briefing... | \n",
+ " AGENT ALEX MERCER | \n",
+ " None | \n",
+ " NONE | \n",
+ " None | \n",
+ " SUSPECTED | \n",
+ " NONE | \n",
+ " NONE | \n",
+ " \"With dulled eyes, he scanned the projectors o... | \n",
+ " 2cf7a230c367a2dfaf0fc3c903eb8948 | \n",
+ " [958fdd043f17ade63cb13570b59df295] | \n",
+ " 2500 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 9d8a0fe5-07b7-4b1a-b5be-1317d0fac005 | \n",
+ " 2 | \n",
+ " claim | \n",
+ " AUTHORITY EXERCISE | \n",
+ " Agent Taylor Cruz exercises authority and dema... | \n",
+ " AGENT TAYLOR CRUZ | \n",
+ " None | \n",
+ " NONE | \n",
+ " None | \n",
+ " TRUE | \n",
+ " NONE | \n",
+ " NONE | \n",
+ " \"It was Taylor Cruz’s voice, laced with an edg... | \n",
+ " 2cf7a230c367a2dfaf0fc3c903eb8948 | \n",
+ " [958fdd043f17ade63cb13570b59df295] | \n",
+ " 2500 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id human_readable_id covariate_type \\\n",
+ "0 ad5a2020-cdec-4982-acdf-dbe5ee530066 1 claim \n",
+ "1 9d8a0fe5-07b7-4b1a-b5be-1317d0fac005 2 claim \n",
+ "\n",
+ " type description \\\n",
+ "0 MISSION INVOLVEMENT Agent Alex Mercer's compliance in the briefing... \n",
+ "1 AUTHORITY EXERCISE Agent Taylor Cruz exercises authority and dema... \n",
+ "\n",
+ " subject_id subject_type object_id object_type status start_date \\\n",
+ "0 AGENT ALEX MERCER None NONE None SUSPECTED NONE \n",
+ "1 AGENT TAYLOR CRUZ None NONE None TRUE NONE \n",
+ "\n",
+ " end_date source_text \\\n",
+ "0 NONE \"With dulled eyes, he scanned the projectors o... \n",
+ "1 NONE \"It was Taylor Cruz’s voice, laced with an edg... \n",
+ "\n",
+ " text_unit_id document_ids \\\n",
+ "0 2cf7a230c367a2dfaf0fc3c903eb8948 [958fdd043f17ade63cb13570b59df295] \n",
+ "1 2cf7a230c367a2dfaf0fc3c903eb8948 [958fdd043f17ade63cb13570b59df295] \n",
+ "\n",
+ " n_tokens \n",
+ "0 2500 \n",
+ "1 2500 "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "cov_df = pd.read_parquet('create_final_covariates.parquet')\n",
+ "cov_df.head(2)\n",
+ "# Subject ids do not match entity ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "3e064234-5fce-448e-8bb4-ab2f35699049",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'_contains_updates': True, 'labels_added': 89, 'relationships_created': 89, 'nodes_created': 89, 'properties_set': 1061}\n",
+ "89 rows in 0.13370895385742188 s.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "89"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# import covariates\n",
+ "cov_statement = \"\"\"\n",
+ "MERGE (c:__Covariate__ {id:value.id})\n",
+ "SET c += apoc.map.clean(value, [\"text_unit_id\", \"document_ids\", \"n_tokens\"], [Null, \"\"])\n",
+ "WITH c, value\n",
+ "MATCH (ch:__Chunk__ {id: value.text_unit_id})\n",
+ "MERGE (ch)-[:HAS_COVARIATE]->(c)\n",
+ "\"\"\"\n",
+ "batched_import(cov_statement, cov_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "fc9f6606-0cce-4f28-9d88-eaf894d8110b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " level | \n",
+ " title | \n",
+ " type | \n",
+ " description | \n",
+ " source_id | \n",
+ " community | \n",
+ " degree | \n",
+ " human_readable_id | \n",
+ " id | \n",
+ " size | \n",
+ " graph_embedding | \n",
+ " entity_type | \n",
+ " top_level_node_id | \n",
+ " x | \n",
+ " y | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ALEX MERCER | \n",
+ " PERSON | \n",
+ " Alex Mercer is a character with a military bac... | \n",
+ " 00fafabae48948779fee2afe600f5143,1e433d6b30887... | \n",
+ " 1 | \n",
+ " 9 | \n",
+ " 0 | \n",
+ " b45241d70f0e43fca764df95b2b81f77 | \n",
+ " 9 | \n",
+ " None | \n",
+ " None | \n",
+ " b45241d70f0e43fca764df95b2b81f77 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 217 | \n",
+ " 1 | \n",
+ " ALEX MERCER | \n",
+ " PERSON | \n",
+ " Alex Mercer is a character with a military bac... | \n",
+ " 00fafabae48948779fee2afe600f5143,1e433d6b30887... | \n",
+ " 4 | \n",
+ " 9 | \n",
+ " 0 | \n",
+ " b45241d70f0e43fca764df95b2b81f77 | \n",
+ " 9 | \n",
+ " None | \n",
+ " None | \n",
+ " b45241d70f0e43fca764df95b2b81f77 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " level title type \\\n",
+ "0 0 ALEX MERCER PERSON \n",
+ "217 1 ALEX MERCER PERSON \n",
+ "\n",
+ " description \\\n",
+ "0 Alex Mercer is a character with a military bac... \n",
+ "217 Alex Mercer is a character with a military bac... \n",
+ "\n",
+ " source_id community degree \\\n",
+ "0 00fafabae48948779fee2afe600f5143,1e433d6b30887... 1 9 \n",
+ "217 00fafabae48948779fee2afe600f5143,1e433d6b30887... 4 9 \n",
+ "\n",
+ " human_readable_id id size \\\n",
+ "0 0 b45241d70f0e43fca764df95b2b81f77 9 \n",
+ "217 0 b45241d70f0e43fca764df95b2b81f77 9 \n",
+ "\n",
+ " graph_embedding entity_type top_level_node_id x y \n",
+ "0 None None b45241d70f0e43fca764df95b2b81f77 0 0 \n",
+ "217 None None b45241d70f0e43fca764df95b2b81f77 0 0 "
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nodes_df = pd.read_parquet('create_final_nodes.parquet')\n",
+ "nodes_df[nodes_df['title'] == 'ALEX MERCER']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "47bb6f5c-4c1c-4849-8f1a-cb76fa98b925",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "community\n",
+ "1 14\n",
+ "2 9\n",
+ "4 9\n",
+ "0 6\n",
+ "5 5\n",
+ "3 3\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nodes_df.community.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "dde636a4-a876-4d30-b1a2-8124023c14ed",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{}\n",
+ "46 rows in 0.06763219833374023 s.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "46"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Connect nodes to first level community\n",
+ "first_df = nodes_df[nodes_df['community'].notna()]\n",
+ "first_statement = \"\"\"\n",
+ "MATCH (c:__Entity__ {name:replace(value.title,'\"','')})\n",
+ "MATCH (c1:__Community__ {community: value.community})\n",
+ "MERGE (c)-[:IN_COMMUNITY]->(c1)\n",
+ "RETURN count(distinct c1)\n",
+ "\"\"\"\n",
+ "batched_import(first_statement, first_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7131f3a0-2b71-4017-9dcd-24913d964dc0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}