neo4j · stellasia · Aug 13, 2024 · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024
diff --git a/examples/pipeline/kg_builder.py b/examples/pipeline/kg_builder.py
@@ -20,6 +20,7 @@
 
 import neo4j
 from langchain_text_splitters import CharacterTextSplitter
+from neo4j_genai.components.embedder import TextChunkEmbedder
 from neo4j_genai.components.entity_relation_extractor import (
     LLMEntityRelationExtractor,
     OnError,
@@ -32,6 +33,7 @@
     SchemaRelation,
 )
 from neo4j_genai.components.text_splitters.langchain import LangChainTextSplitterAdapter
+from neo4j_genai.embeddings.openai import OpenAIEmbeddings
 from neo4j_genai.llm import OpenAILLM
 from neo4j_genai.pipeline import Pipeline
 
@@ -78,6 +80,7 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
             CharacterTextSplitter(chunk_size=50, chunk_overlap=10, separator=".")
         ),
     )
+    pipe.add_component("chunk_embedder", TextChunkEmbedder(embedder=OpenAIEmbeddings()))
     pipe.add_component("schema", SchemaBuilder())
     pipe.add_component(
         "extractor",
@@ -95,8 +98,11 @@ async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
     pipe.add_component("writer", Neo4jWriter(neo4j_driver))
     # define the execution order of component
     # and how the output of previous components must be used
-    pipe.connect("splitter", "extractor", input_config={"chunks": "splitter"})
+    pipe.connect("splitter", "chunk_embedder", input_config={"text_chunks": "splitter"})
     pipe.connect("schema", "extractor", input_config={"schema": "schema"})
+    pipe.connect(
+        "chunk_embedder", "extractor", input_config={"chunks": "chunk_embedder"}
+    )
     pipe.connect(
         "extractor",
         "writer",

diff --git a/src/neo4j_genai/components/embedder.py b/src/neo4j_genai/components/embedder.py
@@ -12,6 +12,8 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+from pydantic import validate_call
+
 from neo4j_genai.components.types import TextChunk, TextChunks
 from neo4j_genai.embedder import Embedder
 from neo4j_genai.pipeline.component import Component
@@ -42,6 +44,7 @@ def _embed_chunk(self, text_chunk: TextChunk) -> TextChunk:
         metadata["embedding"] = embedding
         return TextChunk(text=text_chunk.text, metadata=metadata)
 
+    @validate_call
     async def run(self, text_chunks: TextChunks) -> TextChunks:
         """Embed a list of text chunks.
 

diff --git a/src/neo4j_genai/components/entity_relation_extractor.py b/src/neo4j_genai/components/entity_relation_extractor.py
@@ -91,16 +91,22 @@ def create_next_chunk_relationship(
         )
 
     def create_chunk_node(self, chunk: TextChunk, chunk_id: str) -> Neo4jNode:
-        """Create chunk node with properties 'text' and 'metadata' if metadata is defined."""
+        """Create a chunk node whose properties are 'text' plus every metadata entry;
+        an 'embedding' metadata entry is stored in embedding_properties instead.
+        The input chunk's metadata is left unmodified."""
         chunk_properties: Dict[str, Any] = {
             "text": chunk.text,
         }
+        embedding_properties: Dict[str, Any] = {}
         if chunk.metadata:
-            chunk_properties["metadata"] = chunk.metadata
+            chunk_properties.update(chunk.metadata)  # copies entries; input untouched
+            if "embedding" in chunk_properties:
+                embedding_properties["embedding"] = chunk_properties.pop("embedding")
         return Neo4jNode(
             id=chunk_id,
             label=CHUNK_NODE_LABEL,
             properties=chunk_properties,
+            embedding_properties=embedding_properties,
         )
 
     def create_node_to_chunk_rel(
@@ -162,10 +168,10 @@ async def extract_for_chunk(
         llm_result = self.llm.invoke(prompt)
         try:
             result = json.loads(llm_result.content)
-        except json.JSONDecodeError:
+        except json.JSONDecodeError as e:
             if self.on_error == OnError.RAISE:
                 raise LLMGenerationError(
-                    f"LLM response is not valid JSON {llm_result.content}"
+                    f"LLM response is not valid JSON {llm_result.content}: {e}"
                 )
             else:
                 logger.error(

diff --git a/tests/e2e/data/harry_potter.txt b/tests/e2e/data/harry_potter.txt
@@ -0,0 +1,15 @@
+At Malfoy Manor, Snape tells Voldemort the date that Harry’s friends are planning to
+move him from the house on Privet Drive to a new safe location, so that Voldemort
+can capture Harry en route.
+
+As Harry packs to leave Privet Drive, he reads two obituaries for Dumbledore, both
+of which make him think that he didn’t know Dumbledore as well as he should have.
+Downstairs, he bids good-bye to the Dursleys for the final time, as the threat of
+Voldemort forces them to go into hiding themselves.
+
+The Order of the Phoenix, led by Alastor “Mad-Eye” Moody, arrives to take Harry to
+his new home at the Weasleys’ house, the Burrow. Six of Harry’s friends take
+Polyjuice Potion to disguise themselves as Harry and act as decoys, and they all fly
+off in different directions. The Death Eaters, alerted to their departure by Snape,
+attack Harry and his friends. Voldemort chases Harry down, but Harry’s wand fends
+Voldemort off, seemingly without Harry’s help.
diff --git a/tests/e2e/pinecone_e2e/populate_dbs.py b/tests/e2e/pinecone_e2e/populate_dbs.py
@@ -14,7 +14,6 @@
 #  limitations under the License.
 from __future__ import annotations
 
-import os.path
 from typing import Any
 
 import neo4j
@@ -23,8 +22,6 @@
 
 from ..utils import build_data_objects, populate_neo4j
 
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-
 
 def populate_dbs(
     neo4j_driver: neo4j.Driver, pc_client: Pinecone, index_name: str = "jeopardy"