1- """Base classes for media and documents."""
1+ """Base classes for media and documents.
2+
3+ This module contains core abstractions for **data retrieval and processing workflows**:
4+
5+ - `BaseMedia`: Base class providing `id` and `metadata` fields
6+ - `Blob`: Raw data loading (files, binary data) - used by document loaders
7+ - `Document`: Text content for retrieval (RAG, vector stores, semantic search)
8+
9+ !!! note "Not for LLM chat messages"
10+     These classes are for data processing pipelines, not LLM I/O. For multimodal
11+     content in chat messages (images, audio in conversations), see
12+     `langchain.messages` content blocks instead.
13+ """
214
315from __future__ import annotations
416
1931
2032
2133class BaseMedia (Serializable ):
22- """Use to represent media content.
23-
24- Media objects can be used to represent raw data, such as text or binary data.
34+ """Base class for content used in retrieval and data processing workflows.
2535
26- LangChain Media objects allow associating metadata and an optional identifier
27- with the content.
36+ Provides common fields for content that needs to be stored, indexed, or searched.
2837
29- The presence of an ID and metadata make it easier to store, index, and search
30- over the content in a structured way.
38+ !!! note
39+     For multimodal content in **chat messages** (images, audio sent to/from LLMs),
40+     use `langchain.messages` content blocks instead.
3141 """
3242
3343 # The ID field is optional at the moment.
@@ -45,61 +55,60 @@ class BaseMedia(Serializable):
4555
4656
4757class Blob (BaseMedia ):
48- """Blob represents raw data by either reference or value .
58+ """Raw data abstraction for document loading and file processing.
4959
50- Provides an interface to materialize the blob in different representations, and
51- help to decouple the development of data loaders from the downstream parsing of
52- the raw data.
60+ Represents raw bytes or text, either in-memory or by file reference. Used
61+ primarily by document loaders to decouple data loading from parsing.
5362
5463 Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)
5564
56- Example: Initialize a blob from in-memory data
65+ ???+ example "Initialize a blob from in-memory data"
5766
58- ```python
59- from langchain_core.documents import Blob
67+ ```python
68+ from langchain_core.documents import Blob
6069
61- blob = Blob.from_data("Hello, world!")
70+ blob = Blob.from_data("Hello, world!")
6271
63- # Read the blob as a string
64- print(blob.as_string())
72+ # Read the blob as a string
73+ print(blob.as_string())
6574
66- # Read the blob as bytes
67- print(blob.as_bytes())
75+ # Read the blob as bytes
76+ print(blob.as_bytes())
6877
69- # Read the blob as a byte stream
70- with blob.as_bytes_io() as f:
71- print(f.read())
72- ```
78+ # Read the blob as a byte stream
79+ with blob.as_bytes_io() as f:
80+ print(f.read())
81+ ```
7382
74- Example: Load from memory and specify mime- type and metadata
83+ ??? example "Load from memory and specify MIME type and metadata"
7584
76- ```python
77- from langchain_core.documents import Blob
85+ ```python
86+ from langchain_core.documents import Blob
7887
79- blob = Blob.from_data(
80- data="Hello, world!",
81- mime_type="text/plain",
82- metadata={"source": "https://example.com"},
83- )
84- ```
88+ blob = Blob.from_data(
89+ data="Hello, world!",
90+ mime_type="text/plain",
91+ metadata={"source": "https://example.com"},
92+ )
93+ ```
8594
86- Example: Load the blob from a file
95+ ??? example "Load the blob from a file"
8796
88- ```python
89- from langchain_core.documents import Blob
97+ ```python
98+ from langchain_core.documents import Blob
9099
91- blob = Blob.from_path("path/to/file.txt")
100+ blob = Blob.from_path("path/to/file.txt")
92101
93- # Read the blob as a string
94- print(blob.as_string())
102+ # Read the blob as a string
103+ print(blob.as_string())
95104
96- # Read the blob as bytes
97- print(blob.as_bytes())
105+ # Read the blob as bytes
106+ print(blob.as_bytes())
98107
99- # Read the blob as a byte stream
100- with blob.as_bytes_io() as f:
101- print(f.read())
102- ```
108+ # Read the blob as a byte stream
109+ with blob.as_bytes_io() as f:
110+ print(f.read())
111+ ```
103112 """
104113
105114 data : bytes | str | None = None
@@ -213,7 +222,7 @@ def from_path(
213222 encoding: Encoding to use if decoding the bytes into a string
214223 mime_type: If provided, will be set as the MIME type of the data
215224 guess_type: If `True`, the MIME type will be guessed from the file
216- extension, if a mime- type was not provided
225+ extension, if a MIME type was not provided
217226 metadata: Metadata to associate with the `Blob`
218227
219228 Returns:
@@ -274,6 +283,10 @@ def __repr__(self) -> str:
274283class Document (BaseMedia ):
275284 """Class for storing a piece of text and associated metadata.
276285
286+ !!! note
287+     `Document` is for **retrieval workflows**, not chat I/O. For sending text
288+     to an LLM in a conversation, use message types from `langchain.messages`.
289+
277290 Example:
278291 ```python
279292 from langchain_core.documents import Document
0 commit comments