deepset-ai · anakin87 · May 7, 2025 · Apr 30, 2025 · May 5, 2025 · May 5, 2025
@@ -36,14 +36,14 @@ def indexing_pipeline(documents: List[Document]):
     doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
     doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)
     ingestion_pipe = Pipeline()
-    ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")  # type: ignore
-    ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")  # type: ignore
+    ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
+    ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
     ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
     ingestion_pipe.run({"doc_embedder": {"documents": documents}})
     return document_store
 
 
-def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):  # type: ignore
+def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):
     """RAG pipeline"""
     template = [
         ChatMessage.from_system(
@@ -59,11 +59,11 @@ def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):  # type: ig
         ),
     ]
     rag = Pipeline()
-    rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False))  # type: ignore
-    rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k))  # type: ignore
-    rag.add_component("prompt_builder", ChatPromptBuilder(template=template))  # type: ignore
-    rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini"))  # type: ignore
-    rag.add_component("answer_builder", AnswerBuilder())  # type: ignore
+    rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False))
+    rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k))
+    rag.add_component("prompt_builder", ChatPromptBuilder(template=template))
+    rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini"))
+    rag.add_component("answer_builder", AnswerBuilder())
     rag.connect("embedder", "retriever.query_embedding")
     rag.connect("retriever", "prompt_builder.documents")
     rag.connect("prompt_builder", "generator")

@@ -80,24 +80,22 @@ def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -
         # Create pipeline and add components
         pp = Pipeline()
 
-        # We use type ignore here to avoid type checking errors
-        # This is due to how the run method within the Component protocol is defined
-        pp.add_component("router", router)  # type: ignore[arg-type]
-        pp.add_component("docx", DOCXToDocument(link_format="markdown"))  # type: ignore[arg-type]
+        pp.add_component("router", router)
+        pp.add_component("docx", DOCXToDocument(link_format="markdown"))
         pp.add_component(
             "html",
-            HTMLToDocument(  # type: ignore[arg-type]
+            HTMLToDocument(
                 extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
             ),
         )
-        pp.add_component("json", JSONConverter(content_key=self.json_content_key))  # type: ignore[arg-type]
-        pp.add_component("md", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
-        pp.add_component("text", TextFileToDocument(encoding=self.encoding))  # type: ignore[arg-type]
-        pp.add_component("pdf", PyPDFToDocument())  # type: ignore[arg-type]
-        pp.add_component("pptx", PPTXToDocument())  # type: ignore[arg-type]
-        pp.add_component("xlsx", XLSXToDocument())  # type: ignore[arg-type]
-        pp.add_component("joiner", DocumentJoiner())  # type: ignore[arg-type]
-        pp.add_component("csv", CSVToDocument(encoding=self.encoding))  # type: ignore[arg-type]
+        pp.add_component("json", JSONConverter(content_key=self.json_content_key))
+        pp.add_component("md", TextFileToDocument(encoding=self.encoding))
+        pp.add_component("text", TextFileToDocument(encoding=self.encoding))
+        pp.add_component("pdf", PyPDFToDocument())
+        pp.add_component("pptx", PPTXToDocument())
+        pp.add_component("xlsx", XLSXToDocument())
+        pp.add_component("joiner", DocumentJoiner())
+        pp.add_component("csv", CSVToDocument(encoding=self.encoding))
 
         for mime_type in ConverterMimeType:
             pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])

@@ -127,10 +127,8 @@ def __init__(  # noqa: PLR0913 (too-many-arguments)
         # Build the Pipeline
         pp = Pipeline()
 
-        # We use type ignore here to avoid type checking errors
-        # This is due to how the run method within the Component protocol is defined
-        pp.add_component("splitter", splitter)  # type: ignore[arg-type]
-        pp.add_component("cleaner", cleaner)  # type: ignore[arg-type]
+        pp.add_component("splitter", splitter)
+        pp.add_component("cleaner", cleaner)
 
         # Connect the splitter output to cleaner
         pp.connect("splitter.documents", "cleaner.documents")

@@ -160,12 +160,29 @@ def run(self, **kwargs):
         isinstance(MyComponent, Component)
     """
 
-    # This is the most reliable way to define the protocol for the `run` method.
-    # Defining a method doesn't work as different Components will have different
-    # arguments. Even defining here a method with `**kwargs` doesn't work as the
-    # expected signature must be identical.
-    # This makes most Language Servers and type checkers happy and shows less errors.
-    run: Callable[..., Dict[str, Any]]
+    # The following method stub declares a `run` method compatible with any input signature.
+    # Its type is equivalent to Callable[..., Dict[str, Any]].
+    # See https://typing.python.org/en/latest/spec/callables.html#meaning-of-in-callable.
+    #
+    # Using `run: Callable[..., Dict[str, Any]]` directly leads to type errors: the protocol would expect a settable
+    # attribute `run`, while the actual implementation is a read-only method.
+    # For example:
+    # from haystack import Pipeline, component
+    # @component
+    # class MyComponent:
+    #     @component.output_types(out=str)
+    #     def run(self):
+    #         return {"out": "Hello, world!"}
+    # pipeline = Pipeline()
+    # pipeline.add_component("my_component", MyComponent())
+    #
+    # mypy raises:
+    # error: Argument 2 to "add_component" of "PipelineBase" has incompatible type "MyComponent"; expected "Component"
+    # [arg-type]
+    # note: Protocol member Component.run expected settable variable, got read-only attribute
+
+    def run(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:  # pylint: disable=missing-function-docstring # noqa: D102
+        ...
 
 
 class ComponentMeta(type):

@@ -0,0 +1,9 @@
+---
+upgrade:
+  - |
+    We've added a `py.typed` file to Haystack so that downstream projects can use its type information, in line
+    with PEP 561. This means Haystack's type hints will now be visible to type checkers in projects that depend on it.
+    Haystack is primarily type-checked using mypy (not pyright) and, despite our efforts, some type information may
+    be incomplete or unreliable.
+    If you use static type checking in your own project, you may notice some changes: previously, Haystack's types were
+    effectively treated as `Any`, but now actual type information will be available and enforced.