add embedder png

pgmiso · Feb 13, 2025 · 7d0c5b9 · 7d0c5b9
1 parent 5f0fbb8
commit 7d0c5b9
Show file tree

Hide file tree

Showing 11 changed files with 98 additions and 79 deletions.
diff --git a/adalflow/adalflow/__init__.py b/adalflow/adalflow/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.0.3"
+__version__ = "1.0.4"
 
 from adalflow.core.component import (
     Component,
@@ -21,7 +21,7 @@
     Document,
 )
 from adalflow.core.model_client import ModelClient
-from adalflow.core.embedder import Embedder
+from adalflow.core.embedder import Embedder, BatchEmbedder
 
 # parser
 from adalflow.core.string_parser import (
@@ -74,7 +74,6 @@
 
 # data pipeline
 from adalflow.components.data_process.text_splitter import TextSplitter
-from adalflow.components.data_process.data_components import ToEmbeddings
 
 __all__ = [
     "Component",
@@ -96,6 +95,7 @@
     "ModelClient",
     "Generator",
     "Embedder",
+    "BatchEmbedder",
     "Retriever",
     "Parameter",
     "AdalComponent",

diff --git a/adalflow/adalflow/components/data_process/__init__.py b/adalflow/adalflow/components/data_process/__init__.py
@@ -1,11 +1,11 @@
 """Components here are used for data processing/transformation."""
 
 from .text_splitter import TextSplitter
-from .data_components import ToEmbeddings, RetrieverOutputToContextStr
+from .data_components import RetrieverOutputToContextStr
 from adalflow.utils.registry import EntityMapping
 
 
-__all__ = ["TextSplitter", "ToEmbeddings", "RetrieverOutputToContextStr"]
+__all__ = ["TextSplitter", "RetrieverOutputToContextStr"]
 
 for name in __all__:
     EntityMapping.register(name, globals()[name])
diff --git a/adalflow/adalflow/components/data_process/data_components.py b/adalflow/adalflow/components/data_process/data_components.py
@@ -88,7 +88,7 @@ def __call__(self, input: ToEmbeddingsInputType) -> ToEmbeddingsOutputType:
         # convert documents to a list of strings
         embedder_input: BatchEmbedderInputType = [chunk.text for chunk in output]
         outputs: BatchEmbedderOutputType = self.batch_embedder(input=embedder_input)
-        # n them back to the original order along with its query
+        # put them back to the original order along with its query
         for batch_idx, batch_output in tqdm(
             enumerate(outputs), desc="Adding embeddings to documents from batch"
         ):

diff --git a/adalflow/adalflow/core/types.py b/adalflow/adalflow/core/types.py
@@ -141,7 +141,7 @@ class Usage:
 
 @dataclass
 class EmbedderOutput(DataClass):
-    __doc__ = r"""Container to hold the response from an Embedder model. Only Per-batch.
+    __doc__ = r"""Container to hold the response from an Embedder datacomponent for a single batch of input.
 
     Data standard for Embedder model output to interact with other components.
     Batch processing is often available, thus we need a list of Embedding objects.

diff --git a/adalflow/pyproject.toml b/adalflow/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "adalflow"
-version = "1.0.3"
+version = "1.0.4"
 description = "The Library to Build and Auto-optimize LLM Applications"
 authors = ["Li Yin <li@sylphai.com>"]
 readme = "README.md"
@@ -36,7 +36,7 @@ jsonlines = "^4.0.0"
 tiktoken = ">=0.3.3"
 numpy = [
     { version = "<2.1.0", markers = "python_version < '3.10'" },
-    { version = "*", markers = "python_version >= '3.10'" }
+    { version = "*", markers = "python_version >= '3.10'" },
 ]
 tqdm = "^4.66.4"
 PyYAML = ">=6.0.1"
@@ -105,10 +105,7 @@ anthropic = ["anthropic"]
 cohere = ["cohere"]
 google-generativeai = ["google-generativeai"]
 ollama = ["ollama"]
-azure = [
-    "azure-core",
-    "azure-identity"
-]
+azure = ["azure-core", "azure-identity"]
 bedrock = ["boto3"]
 together = ["together"]
 mistralai = ["mistralai"]
@@ -139,5 +136,5 @@ exclude = ["images"]
 lint.extend-ignore = [
     "E402",  # Ignore module-level import issues
     "E731",
-    "UP007"  # Disable warning for Union types formatting in Python 3.8
+    "UP007", # Disable warning for Union types formatting in Python 3.8
 ]
diff --git a/docs/source/_static/images/embedder.png b/docs/source/_static/images/embedder.png
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -1,5 +1,5 @@
 
-*"Say Goodbye to Manual Prompting and No More Vendor Lock-in"*
+*"Say Goodbye to Manual Prompting"*
 
 Getting Started: Install AdalFlow and Run Your First Query
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -9,8 +9,10 @@ Getting Started: Install AdalFlow and Run Your First Query
 
    pip install -U adalflow
 
-.. tabs::
+LM apps often rely on other cloud or local model services, and each of them often has its own Python SDK.
+AdalFlow handles all of them as optional packages, so that developers only need to install the ones they need.
 
+.. tabs::
 
    .. tab:: OpenAI
 
@@ -601,30 +603,21 @@ Community
    :hidden:
 
    new_tutorials/index
-   .. :caption: Tutorials - How each part works
-   .. :hidden:
 
 
-.. .. Hide the use cases for now
 .. toctree::
+   :glob:
    :maxdepth: 1
    :hidden:
 
-   use_cases/index
-
-
-      .. :caption: Benchmarks
+   integrations/index
 
-      .. Manually add documents for the code in benchmarks
-
-
-..    :glob:
-..    :maxdepth: 1
-..    :caption: Resources
+.. toctree::
+   :maxdepth: 1
+   :hidden:
 
-..    resources/index
+   use_cases/index
 
-.. hide the for contributors now
 
 .. toctree::
    :glob:

diff --git a/docs/source/new_tutorials/integrations.rst → docs/source/integrations/integrations.rst b/docs/source/new_tutorials/integrations.rst → docs/source/integrations/integrations.rst
@@ -1,9 +1,9 @@
 .. _get_started-integrations:
 
-Integrations
-===========
+All Providers
+==================
 
-AdalFlow integrates with many popular AI and database platforms to provide a comprehensive solution for your LLM applications.
+AdalFlow integrates with many popular AI and database platforms to provide a comprehensive solution for your LM applications.
 
 Model Providers
 ---------------
@@ -173,24 +173,6 @@ Embedding and Reranking Models
      - Chunking large text into smaller segments for more efficient and accurate embedding, retrieval, and LLM context processing.
      - :ref:`TextSplitter <tutorials-text_splitter>`
 
-.. .. list-table:: Embeddings and Reranking
-..    :widths: 25 55 20
-..    :header-rows: 1
-
-..    * - **Major Class**
-..      - **Description**
-..      - **Reference**
-..    * - **Embedding Models**
-..      - Models that convert text (or other data) into high-dimensional vectors. A core step for vector similarity or semantic understanding.
-..        Examples include OpenAI Embeddings, Hugging Face transformers, etc.
-..      - “core.embedder.Embedder” docs
-..    * - **Re-ranking Models**
-..      - Models that reorder or refine retrieved candidates based on more advanced semantic understanding or specialized metrics.
-..        Improves final retrieval accuracy.
-..      - “Rerank” doc references (BERT, Cohere, etc.)
-..    * - **LLM-based Retrieval** (optional)
-..      - Using an LLM directly for retrieval or re-ranking. Often more expensive but can be more accurate for certain tasks.
-..      - Could be integrated in your LLM pipeline
 
 .. raw:: html
 

diff --git a/docs/source/new_tutorials/embedder.rst b/docs/source/new_tutorials/embedder.rst
@@ -16,17 +16,24 @@
 Embedder
 ============
 
+.. figure:: /_static/images/embedder.png
+    :align: center
+    :alt: AdalFlow embedder design
+    :width: 700px
+
+    Embedder - Converts a list of strings into a list of vectors with embedding models.
 
 Introduction
 ------------------
 
 :class:`core.embedder.Embedder` allows developers to use different embedding models easily.
 Like `Generator`, `Embedder` is a user-facing component that orchestrates embedding models via `ModelClient` and `output_processors`, it outputs :class:`EmbedderOutput<core.types.EmbedderOutput>`.
-Unlike `Generator` which is trainable, `Embedder` is just a `DataComponent` that likely transforms input strings into embeddings/vectors.
+Unlike `Generator` which is trainable, `Embedder` is just a `DataComponent` that only transforms input strings into embeddings/vectors.
 
 
 By switching the ``ModelClient``, you can use different embedding models in your task pipeline easily, or even embed different data such as text, image, etc.
-
+For end developers, most likely you want to use :class:`ToEmbeddings<components.data_process.data_components.ToEmbeddings>` together with `Embedder`, as it (1) directly supports a sequence of `Document` objects, and (2) handles batch processing out of the box.
+:class:`Document<core.types.Document>` is the container that AdalFlow also uses to process data in :class:`TextSplitter<components.data_process.text_splitter.TextSplitter>`, which is often required in a RAG pipeline.
 .. EmbedderOutput
 .. --------------
 
@@ -45,13 +52,14 @@ By switching the ``ModelClient``, you can use different embedding models in your
 
 
 We currently support `all embedding models from OpenAI <https://platform.openai.com/docs/guides/embeddings>`_ and `'thenlper/gte-base' <https://huggingface.co/thenlper/gte-base>`_ from HuggingFace `transformers <https://huggingface.co/docs/transformers/en/index>`_.
-We will use these two to demonstrate how to use ``Embedder``, one from the API provider and the other using local model. For the local model, you might need to ensure ``transformers`` is installed.
-
-.. note ::
-    The ``output_processors`` can be a component or a ``Sequential`` container to chain together multiple components. The output processors are applied in order and is adapted only on the ``data`` field of the ``EmbedderOutput``.
+We will use these two to demonstrate how to use ``Embedder``. For the local model, you need to ensure you have ``transformers`` installed.
 
-Use OpenAI embedding models
+Use Embedder
 ----------------------------
+OpenAI Embedding Model
+^^^^^^^^^^^^^^^^^^^^^
+
+
 Before you start, ensure you configure the API key either in an environment variable or a `.env` file, or pass it directly to the ``OpenAIClient``.
 
 .. code-block:: python
@@ -68,7 +76,7 @@ Before you start ensure you config the API key either in the environment variabl
         "encoding_format": "float",
     }
 
-    query = "What is the capital of China?"
+    query = "What is LLM?"
 
     queries = [query] * 100
 
@@ -95,16 +103,16 @@ Run the embedder and print the length and embedding dimension of the output.
     # 1 256 True
 
 
-**Embed batch queries**:
+**Embed a single batch of queries**:
 
 .. code-block:: python
 
     output = embedder(queries)
     print(output.length, output.embedding_dim)
     # 100 256
 
-Use Local Model
--------------------
+Local Model
+^^^^^^^^^^^^^^^^^^^^^
 Set up the embedder with the local model.
 
 .. code-block:: python
@@ -128,7 +136,8 @@ Now, call the embedder with the same query and queries.
     # 100 768 True
 
 Use Output Processors
-----------------------
+^^^^^^^^^^^^^^^^^^^^^
+
 If we want to decrease the embedding dimension to only 256 to save memory, we can customize an additional output processing step and pass it to embedder via the ``output_processors`` argument.
 
 .. code-block:: python
@@ -192,8 +201,24 @@ Run a query:
     # 1 256 True
 
 
-BatchEmbedder
---------------
+ToEmbeddings
+----------------
+Once we know how to configure and set up Embedder, we can use :class:`ToEmbeddings<components.data_process.data_components.ToEmbeddings>` to directly convert a list of `Document` objects into embeddings.
+
+.. code-block:: python
+
+    from adalflow.components.data_process.data_components import ToEmbeddings
+    from adalflow.core.types import Document
+
+    to_embeddings = ToEmbeddings(embedder=embedder, batch_size=100)
+
+    docs = [Document(text="What is LLM?")] * 1000
+    output = to_embeddings(docs)
+    print(f"Response - Length: {len(output)}")
+    # 1000
+
+[Optional]BatchEmbedder
+--------------------------
 Especially in data processing pipelines, you can often have more than 1000 queries to embed. We need to chunk our queries into smaller batches to avoid memory overflow.
 :class:`core.embedder.BatchEmbedder` is designed to handle this situation. For now, the code is rather simple, but in the future it can be extended to support multi-processing when you use AdalFlow in production data pipeline.
 

diff --git a/docs/source/new_tutorials/index.rst b/docs/source/new_tutorials/index.rst
@@ -15,4 +15,3 @@ Tutorials
    parser
    generator
    embedder
-   integrations