Skip to content

feat: add py.typed; adjust Component protocol #9329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions e2e/pipelines/test_evaluation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ def indexing_pipeline(documents: List[Document]):
doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
doc_embedder = SentenceTransformersDocumentEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)
ingestion_pipe = Pipeline()
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder") # type: ignore
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer") # type: ignore
ingestion_pipe.add_component(instance=doc_embedder, name="doc_embedder")
ingestion_pipe.add_component(instance=doc_writer, name="doc_writer")
ingestion_pipe.connect("doc_embedder.documents", "doc_writer.documents")
ingestion_pipe.run({"doc_embedder": {"documents": documents}})
return document_store


def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ignore
def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int):
"""RAG pipeline"""
template = [
ChatMessage.from_system(
Expand All @@ -59,11 +59,11 @@ def rag_pipeline(document_store: InMemoryDocumentStore, top_k: int): # type: ig
),
]
rag = Pipeline()
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False)) # type: ignore
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k)) # type: ignore
rag.add_component("prompt_builder", ChatPromptBuilder(template=template)) # type: ignore
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini")) # type: ignore
rag.add_component("answer_builder", AnswerBuilder()) # type: ignore
rag.add_component("embedder", SentenceTransformersTextEmbedder(model=EMBEDDINGS_MODEL, progress_bar=False))
rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store, top_k=top_k))
rag.add_component("prompt_builder", ChatPromptBuilder(template=template))
rag.add_component("generator", OpenAIChatGenerator(model="gpt-4o-mini"))
rag.add_component("answer_builder", AnswerBuilder())
rag.connect("embedder", "retriever.query_embedding")
rag.connect("retriever", "prompt_builder.documents")
rag.connect("prompt_builder", "generator")
Expand Down
24 changes: 11 additions & 13 deletions haystack/components/converters/multi_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,24 +80,22 @@ def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -
# Create pipeline and add components
pp = Pipeline()

# We use type ignore here to avoid type checking errors
# This is due to how the run method within the Component protocol is defined
pp.add_component("router", router) # type: ignore[arg-type]
pp.add_component("docx", DOCXToDocument(link_format="markdown")) # type: ignore[arg-type]
pp.add_component("router", router)
pp.add_component("docx", DOCXToDocument(link_format="markdown"))
pp.add_component(
"html",
HTMLToDocument( # type: ignore[arg-type]
HTMLToDocument(
extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
),
)
pp.add_component("json", JSONConverter(content_key=self.json_content_key)) # type: ignore[arg-type]
pp.add_component("md", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("text", TextFileToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("pdf", PyPDFToDocument()) # type: ignore[arg-type]
pp.add_component("pptx", PPTXToDocument()) # type: ignore[arg-type]
pp.add_component("xlsx", XLSXToDocument()) # type: ignore[arg-type]
pp.add_component("joiner", DocumentJoiner()) # type: ignore[arg-type]
pp.add_component("csv", CSVToDocument(encoding=self.encoding)) # type: ignore[arg-type]
pp.add_component("json", JSONConverter(content_key=self.json_content_key))
pp.add_component("md", TextFileToDocument(encoding=self.encoding))
pp.add_component("text", TextFileToDocument(encoding=self.encoding))
pp.add_component("pdf", PyPDFToDocument())
pp.add_component("pptx", PPTXToDocument())
pp.add_component("xlsx", XLSXToDocument())
pp.add_component("joiner", DocumentJoiner())
pp.add_component("csv", CSVToDocument(encoding=self.encoding))

for mime_type in ConverterMimeType:
pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])
Expand Down
6 changes: 2 additions & 4 deletions haystack/components/preprocessors/document_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,8 @@ def __init__( # noqa: PLR0913 (too-many-arguments)
# Build the Pipeline
pp = Pipeline()

# We use type ignore here to avoid type checking errors
# This is due to how the run method within the Component protocol is defined
pp.add_component("splitter", splitter) # type: ignore[arg-type]
pp.add_component("cleaner", cleaner) # type: ignore[arg-type]
pp.add_component("splitter", splitter)
pp.add_component("cleaner", cleaner)

# Connect the splitter output to cleaner
pp.connect("splitter.documents", "cleaner.documents")
Expand Down
29 changes: 23 additions & 6 deletions haystack/core/component/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,29 @@ def run(self, **kwargs):
isinstance(MyComponent, Component)
"""

# This is the most reliable way to define the protocol for the `run` method.
# Defining a method doesn't work as different Components will have different
# arguments. Even defining here a method with `**kwargs` doesn't work as the
# expected signature must be identical.
# This makes most language servers and type checkers happy and shows fewer errors.
run: Callable[..., Dict[str, Any]]
# The following definition declares a `run` method compatible with any input signature.
# Its type is equivalent to Callable[..., Dict[str, Any]].
# See https://typing.python.org/en/latest/spec/callables.html#meaning-of-in-callable.
#
# Using `run: Callable[..., Dict[str, Any]]` directly leads to type errors: the protocol would expect a settable
# attribute `run`, while the actual implementation is a read-only method.
Comment on lines +167 to +168
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be addressed by defining run as a property getter instead: #9344

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, yours also seems like a valid solution.
The one proposed in this PR seems more explicit to me, but I would say it's a matter of taste.

# For example:
# from haystack import Pipeline, component
# @component
# class MyComponent:
# @component.output_types(out=str)
# def run(self):
# return {"out": "Hello, world!"}
# pipeline = Pipeline()
# pipeline.add_component("my_component", MyComponent())
#
# mypy raises:
# error: Argument 2 to "add_component" of "PipelineBase" has incompatible type "MyComponent"; expected "Component"
# [arg-type]
# note: Protocol member Component.run expected settable variable, got read-only attribute

def run(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: # pylint: disable=missing-function-docstring # noqa: D102
...


class ComponentMeta(type):
Expand Down
Empty file added haystack/py.typed
Empty file.
9 changes: 9 additions & 0 deletions releasenotes/notes/py-typed-724eea7222640e6d.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
upgrade:
- |
We've added a `py.typed` file to Haystack to enable type information to be used by downstream projects, in line
with PEP 561. This means Haystack's type hints will now be visible to type checkers in projects that depend on it.
Haystack is primarily type checked using mypy (not pyright) and, despite our efforts, some type information can
be incomplete or unreliable.
If you use static type checking in your own project, you may notice some changes, such as new type errors:
previously, Haystack's types were effectively treated as `Any`, but now actual type information is available and enforced.
Loading