diff --git a/llm/ms_graphrag_import.ipynb b/llm/ms_graphrag_import.ipynb new file mode 100644 index 0000000..f5334a5 --- /dev/null +++ b/llm/ms_graphrag_import.ipynb @@ -0,0 +1,1030 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3eeee95f-e4f2-4052-94fb-a5dc8ab542ae", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n", + " from pandas.core import (\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from neo4j import GraphDatabase\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b6c15443-4acb-4f91-88ea-4e08abaa4c29", + "metadata": {}, + "outputs": [], + "source": [ + "NEO4J_URI=\"bolt://localhost\"\n", + "NEO4J_USERNAME=\"neo4j\"\n", + "NEO4J_PASSWORD=\"password\"\n", + "\n", + "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d787bf7b-ac9b-4bfb-b140-a50a3fd205c5", + "metadata": {}, + "outputs": [], + "source": [ + "def batched_import(statement, df, batch_size=1000):\n", + " total = len(df)\n", + " start_s = time.time()\n", + " for start in range(0,total, batch_size):\n", + " batch = df.iloc[start: min(start+batch_size,total)]\n", + " result = driver.execute_query(\"UNWIND $rows AS value \" + statement, \n", + " rows=batch.to_dict('records'))\n", + " print(result.summary.counters)\n", + " print(f'{total} rows in { time.time() - start_s} s.') \n", + " return total" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ed7f212e-9148-424c-adc6-d81db9f8e5a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique\n", + "\n", + "create constraint document_id if not exists for (d:__Document__) require d.id is unique\n", + "\n", + "create constraint entity_id if not exists for (c:__Community__) require c.community is unique\n", + "\n", + "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique\n", + "\n", + "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique\n", + "\n", + "create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique\n", + "\n", + "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique\n" + ] + } + ], + "source": [ + "# create constraints\n", + "\n", + "statements = \"\"\"\n", + "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;\n", + "create constraint document_id if not exists for (d:__Document__) require d.id is unique;\n", + "create constraint entity_id if not exists for (c:__Community__) require c.community is unique;\n", + "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;\n", + "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;\n", + "create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique;\n", + "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;\n", + "\"\"\".split(\";\")\n", + "\n", + "for s in statements:\n", + " if len((s or \"\").strip()) > 0:\n", + " print(s)\n", + " driver.execute_query(query_=s)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5ec93c92-499d-4ec6-bf3b-c34f74552600", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtextn_tokensdocument_idsentity_idsrelationship_idscovariate_ids
02cf7a230c367a2dfaf0fc3c903eb8948# Operation: Dulce\\n\\n## Chapter 1\\n\\nThe thru...2500[958fdd043f17ade63cb13570b59df295][b45241d70f0e43fca764df95b2b81f77, 4119fd06010...[b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...[ad5a2020-cdec-4982-acdf-dbe5ee530066, 9d8a0fe...
16d1255303acb7c9dc951cb0f5fc3042cbe the same.\\n\\n\\*\\n\\nThe sense of foreboding...2500[958fdd043f17ade63cb13570b59df295][b45241d70f0e43fca764df95b2b81f77, 4119fd06010...[b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870...[5d1c9126-c48d-4755-9f9c-f739c823f95f, ec64a42...
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 2cf7a230c367a2dfaf0fc3c903eb8948 \n", + "1 6d1255303acb7c9dc951cb0f5fc3042c \n", + "\n", + " text n_tokens \\\n", + "0 # Operation: Dulce\\n\\n## Chapter 1\\n\\nThe thru... 2500 \n", + "1 be the same.\\n\\n\\*\\n\\nThe sense of foreboding... 2500 \n", + "\n", + " document_ids \\\n", + "0 [958fdd043f17ade63cb13570b59df295] \n", + "1 [958fdd043f17ade63cb13570b59df295] \n", + "\n", + " entity_ids \\\n", + "0 [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... \n", + "1 [b45241d70f0e43fca764df95b2b81f77, 4119fd06010... \n", + "\n", + " relationship_ids \\\n", + "0 [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870... \n", + "1 [b35c3d1a7daa4924b6bdb58bc69c354d, a97e2ecd870... \n", + "\n", + " covariate_ids \n", + "0 [ad5a2020-cdec-4982-acdf-dbe5ee530066, 9d8a0fe... \n", + "1 [5d1c9126-c48d-4755-9f9c-f739c823f95f, ec64a42... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_df = pd.read_parquet('create_final_text_units.parquet')\n", + "text_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ffd3d380-8710-46f5-b90a-04ed8482192c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_contains_updates': True, 'labels_added': 13, 'relationships_created': 12, 'nodes_created': 13, 'properties_set': 37}\n", + "12 rows in 0.08599472045898438 s.\n" + ] + }, + { + "data": { + "text/plain": [ + "12" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "statement = \"\"\"\n", + "MERGE (n:__Chunk__ {id:value.id})\n", + "SET n += value {.text, .n_tokens}\n", + "WITH n, value\n", + "UNWIND value.document_ids AS document\n", + "MERGE (d:__Document__ {id:document})\n", + "MERGE (n)-[:PART_OF_DOCUMENT]->(d)\n", + "\"\"\"\n", + "batched_import(statement, text_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "140b420e-045e-4c71-9f25-1a20c5b528bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnametypedescriptionhuman_readable_idgraph_embeddingtext_unit_idsdescription_embedding
0b45241d70f0e43fca764df95b2b81f77ALEX MERCERPERSONAlex Mercer is a character with a military bac...0None[00fafabae48948779fee2afe600f5143, 1e433d6b308...[0.009358493611216545, -0.02407047711312771, -...
14119fd06010c494caa07f439b333f4c5TAYLOR CRUZPERSONTaylor Cruz is a character who plays a pivotal...1None[00fafabae48948779fee2afe600f5143, 1e433d6b308...[0.0020127426832914352, -0.027186712250113487,...
\n", + "
" + ], + "text/plain": [ + " id name type \\\n", + "0 b45241d70f0e43fca764df95b2b81f77 ALEX MERCER PERSON \n", + "1 4119fd06010c494caa07f439b333f4c5 TAYLOR CRUZ PERSON \n", + "\n", + " description human_readable_id \\\n", + "0 Alex Mercer is a character with a military bac... 0 \n", + "1 Taylor Cruz is a character who plays a pivotal... 1 \n", + "\n", + " graph_embedding text_unit_ids \\\n", + "0 None [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n", + "1 None [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n", + "\n", + " description_embedding \n", + "0 [0.009358493611216545, -0.02407047711312771, -... \n", + "1 [0.0020127426832914352, -0.027186712250113487,... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entity_df = pd.read_parquet('create_final_entities.parquet')\n", + "entity_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1d038114-0714-48ee-a48a-c421cd539661", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_contains_updates': True, 'labels_added': 217, 'relationships_created': 307, 'nodes_created': 217, 'properties_set': 1085}\n", + "217 rows in 0.37180399894714355 s.\n" + ] + }, + { + "data": { + "text/plain": [ + "217" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "entity_statement = \"\"\"\n", + "MERGE (n:__Entity__ {id:value.id})\n", + "SET n += value {.human_readable_id, .description, name:replace(value.name,'\"',''), .description_embedding}\n", + "WITH n, value\n", + "CALL apoc.create.addLabels(n, case when value.type is null OR value.type = \"\" then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n", + "UNWIND value.text_unit_ids AS text_unit\n", + "MERGE (c:__Chunk__ {id:text_unit})\n", + "MERGE (c)-[:MENTIONS]->(n)\n", + "RETURN count(*)\n", + "\"\"\"\n", + "batched_import(entity_statement, entity_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5e713603-c508-4964-ba49-474e4867b747", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sourcetargetweightdescriptiontext_unit_idsidhuman_readable_idsource_degreetarget_degreerank
0ALEX MERCERTAYLOR CRUZ7.0Alex Mercer and Taylor Cruz are integral membe...[00fafabae48948779fee2afe600f5143, 1e433d6b308...b35c3d1a7daa4924b6bdb58bc69c354d091221
1ALEX MERCERTAYLOR CRUZ7.0Alex Mercer and Taylor Cruz are integral membe...[00fafabae48948779fee2afe600f5143, 1e433d6b308...b35c3d1a7daa4924b6bdb58bc69c354d091221
\n", + "
" + ], + "text/plain": [ + " source target weight \\\n", + "0 ALEX MERCER TAYLOR CRUZ 7.0 \n", + "1 ALEX MERCER TAYLOR CRUZ 7.0 \n", + "\n", + " description \\\n", + "0 Alex Mercer and Taylor Cruz are integral membe... \n", + "1 Alex Mercer and Taylor Cruz are integral membe... \n", + "\n", + " text_unit_ids \\\n", + "0 [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n", + "1 [00fafabae48948779fee2afe600f5143, 1e433d6b308... \n", + "\n", + " id human_readable_id source_degree \\\n", + "0 b35c3d1a7daa4924b6bdb58bc69c354d 0 9 \n", + "1 b35c3d1a7daa4924b6bdb58bc69c354d 0 9 \n", + "\n", + " target_degree rank \n", + "0 12 21 \n", + "1 12 21 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rel_df = pd.read_parquet('create_final_relationships.parquet')\n", + "rel_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "27900c01-89e1-4dec-9d5c-c07317c68baf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_contains_updates': True, 'relationships_created': 69, 'properties_set': 1449}\n", + "276 rows in 0.1078798770904541 s.\n" + ] + }, + { + "data": { + "text/plain": [ + "276" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rel_statement = \"\"\"\n", + " MATCH (source:__Entity__ {name:replace(value.source,'\"','')})\n", + " MATCH (target:__Entity__ {name:replace(value.target,'\"','')})\n", + " // not necessary to merge on id as there is only one relationship per pair\n", + " MERGE (source)-[rel:RELATED {id: value.id}]->(target)\n", + " SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}\n", + " RETURN count(*) as createdRels\n", + "\"\"\"\n", + "batched_import(rel_statement, rel_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1be9e7a9-69ee-406b-bce5-95a9c41ecffe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communityfull_contentlevelranktitlerank_explanationsummaryfindingsfull_content_jsonid
04# Dulce Base and the Paranormal Military Squad...18.5Dulce Base and the Paranormal Military Squad: ...The impact severity rating is high due to the ...The community is centered around Dulce Base, a...[{'explanation': 'Dulce Base is the primary lo...{\\n \"title\": \"Dulce Base and the Paranormal...6f8ba6b6-506e-46c1-83ce-982d59622554
15# Sam Rivera and the Paranormal Military Squad...17.5Sam Rivera and the Paranormal Military Squad a...The impact severity rating is high due to the ...The community is centered around Sam Rivera, a...[{'explanation': 'Sam Rivera is recognized for...{\\n \"title\": \"Sam Rivera and the Paranormal...418f4536-d673-4212-8a7c-ca1aac547d0f
\n", + "
" + ], + "text/plain": [ + " community full_content level rank \\\n", + "0 4 # Dulce Base and the Paranormal Military Squad... 1 8.5 \n", + "1 5 # Sam Rivera and the Paranormal Military Squad... 1 7.5 \n", + "\n", + " title \\\n", + "0 Dulce Base and the Paranormal Military Squad: ... \n", + "1 Sam Rivera and the Paranormal Military Squad a... \n", + "\n", + " rank_explanation \\\n", + "0 The impact severity rating is high due to the ... \n", + "1 The impact severity rating is high due to the ... \n", + "\n", + " summary \\\n", + "0 The community is centered around Dulce Base, a... \n", + "1 The community is centered around Sam Rivera, a... \n", + "\n", + " findings \\\n", + "0 [{'explanation': 'Dulce Base is the primary lo... \n", + "1 [{'explanation': 'Sam Rivera is recognized for... \n", + "\n", + " full_content_json \\\n", + "0 {\\n \"title\": \"Dulce Base and the Paranormal... \n", + "1 {\\n \"title\": \"Sam Rivera and the Paranormal... \n", + "\n", + " id \n", + "0 6f8ba6b6-506e-46c1-83ce-982d59622554 \n", + "1 418f4536-d673-4212-8a7c-ca1aac547d0f " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "community_df = pd.read_parquet('create_final_community_reports.parquet')\n", + "community_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5c6ed591-f98c-4403-9fde-8d4cb4c01cca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_contains_updates': True, 'labels_added': 37, 'relationships_created': 31, 'nodes_created': 37, 'properties_set': 110}\n", + "6 rows in 0.05302619934082031 s.\n" + ] + }, + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# import communities\n", + "# Run only once / not idempotent\n", + "community_statement = \"\"\"\n", + "MERGE (c:__Community__ {id:value.id})\n", + "SET c += value {.community, .level, .title, .rank, .rank_explanation, .full_content, .summary}\n", + "WITH c, value\n", + "UNWIND value.findings AS finding\n", + "CREATE (c)-[:HAS_FINDING]->(f:Finding)\n", + "SET f += finding\n", + "\"\"\"\n", + "batched_import(community_statement, community_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "523bed92-d12c-4fc4-aa44-6c62321b36bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idhuman_readable_idcovariate_typetypedescriptionsubject_idsubject_typeobject_idobject_typestatusstart_dateend_datesource_texttext_unit_iddocument_idsn_tokens
0ad5a2020-cdec-4982-acdf-dbe5ee5300661claimMISSION INVOLVEMENTAgent Alex Mercer's compliance in the briefing...AGENT ALEX MERCERNoneNONENoneSUSPECTEDNONENONE\"With dulled eyes, he scanned the projectors o...2cf7a230c367a2dfaf0fc3c903eb8948[958fdd043f17ade63cb13570b59df295]2500
19d8a0fe5-07b7-4b1a-b5be-1317d0fac0052claimAUTHORITY EXERCISEAgent Taylor Cruz exercises authority and dema...AGENT TAYLOR CRUZNoneNONENoneTRUENONENONE\"It was Taylor Cruz’s voice, laced with an edg...2cf7a230c367a2dfaf0fc3c903eb8948[958fdd043f17ade63cb13570b59df295]2500
\n", + "
" + ], + "text/plain": [ + " id human_readable_id covariate_type \\\n", + "0 ad5a2020-cdec-4982-acdf-dbe5ee530066 1 claim \n", + "1 9d8a0fe5-07b7-4b1a-b5be-1317d0fac005 2 claim \n", + "\n", + " type description \\\n", + "0 MISSION INVOLVEMENT Agent Alex Mercer's compliance in the briefing... \n", + "1 AUTHORITY EXERCISE Agent Taylor Cruz exercises authority and dema... \n", + "\n", + " subject_id subject_type object_id object_type status start_date \\\n", + "0 AGENT ALEX MERCER None NONE None SUSPECTED NONE \n", + "1 AGENT TAYLOR CRUZ None NONE None TRUE NONE \n", + "\n", + " end_date source_text \\\n", + "0 NONE \"With dulled eyes, he scanned the projectors o... \n", + "1 NONE \"It was Taylor Cruz’s voice, laced with an edg... \n", + "\n", + " text_unit_id document_ids \\\n", + "0 2cf7a230c367a2dfaf0fc3c903eb8948 [958fdd043f17ade63cb13570b59df295] \n", + "1 2cf7a230c367a2dfaf0fc3c903eb8948 [958fdd043f17ade63cb13570b59df295] \n", + "\n", + " n_tokens \n", + "0 2500 \n", + "1 2500 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cov_df = pd.read_parquet('create_final_covariates.parquet')\n", + "cov_df.head(2)\n", + "# Subject id do not match entity ids" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3e064234-5fce-448e-8bb4-ab2f35699049", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'_contains_updates': True, 'labels_added': 89, 'relationships_created': 89, 'nodes_created': 89, 'properties_set': 1061}\n", + "89 rows in 0.13370895385742188 s.\n" + ] + }, + { + "data": { + "text/plain": [ + "89" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# import covariates\n", + "cov_statement = \"\"\"\n", + "MERGE (c:__Covariate__ {id:value.id})\n", + "SET c += apoc.map.clean(value, [\"text_unit_id\", \"document_ids\", \"n_tokens\"], [Null, \"\"])\n", + "WITH c, value\n", + "MATCH (ch:__Chunk__ {id: value.text_unit_id})\n", + "MERGE (ch)-[:HAS_COVARIATE]->(c)\n", + "\"\"\"\n", + "batched_import(cov_statement, cov_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "fc9f6606-0cce-4f28-9d88-eaf894d8110b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
leveltitletypedescriptionsource_idcommunitydegreehuman_readable_ididsizegraph_embeddingentity_typetop_level_node_idxy
00ALEX MERCERPERSONAlex Mercer is a character with a military bac...00fafabae48948779fee2afe600f5143,1e433d6b30887...190b45241d70f0e43fca764df95b2b81f779NoneNoneb45241d70f0e43fca764df95b2b81f7700
2171ALEX MERCERPERSONAlex Mercer is a character with a military bac...00fafabae48948779fee2afe600f5143,1e433d6b30887...490b45241d70f0e43fca764df95b2b81f779NoneNoneb45241d70f0e43fca764df95b2b81f7700
\n", + "
" + ], + "text/plain": [ + " level title type \\\n", + "0 0 ALEX MERCER PERSON \n", + "217 1 ALEX MERCER PERSON \n", + "\n", + " description \\\n", + "0 Alex Mercer is a character with a military bac... \n", + "217 Alex Mercer is a character with a military bac... \n", + "\n", + " source_id community degree \\\n", + "0 00fafabae48948779fee2afe600f5143,1e433d6b30887... 1 9 \n", + "217 00fafabae48948779fee2afe600f5143,1e433d6b30887... 4 9 \n", + "\n", + " human_readable_id id size \\\n", + "0 0 b45241d70f0e43fca764df95b2b81f77 9 \n", + "217 0 b45241d70f0e43fca764df95b2b81f77 9 \n", + "\n", + " graph_embedding entity_type top_level_node_id x y \n", + "0 None None b45241d70f0e43fca764df95b2b81f77 0 0 \n", + "217 None None b45241d70f0e43fca764df95b2b81f77 0 0 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes_df = pd.read_parquet('create_final_nodes.parquet')\n", + "nodes_df[nodes_df['title'] == 'ALEX MERCER']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "47bb6f5c-4c1c-4849-8f1a-cb76fa98b925", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "community\n", + "1 14\n", + "2 9\n", + "4 9\n", + "0 6\n", + "5 5\n", + "3 3\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes_df.community.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "dde636a4-a876-4d30-b1a2-8124023c14ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{}\n", + "46 rows in 0.06763219833374023 s.\n" + ] + }, + { + "data": { + "text/plain": [ + "46" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Connect nodes to first level community\n", + "first_df = nodes_df[nodes_df['community'].notna()]\n", + "first_statement = \"\"\"\n", + "MATCH (c:__Entity__ {name:replace(value.title,'\"','')})\n", + "MATCH (c1:__Community__ {community: value.community})\n", + "MERGE (c)-[:IN_COMMUNITY]->(c1)\n", + "RETURN count(distinct c1)\n", + "\"\"\"\n", + "batched_import(first_statement, first_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7131f3a0-2b71-4017-9dcd-24913d964dc0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}