14 changes: 7 additions & 7 deletions core/embedders/__init__.py
@@ -29,13 +29,13 @@
# Convenience function for backward compatibility
def create_embedder(model_name: str = "jinaai/jina-embeddings-v4", **kwargs):
"""
Create an embedder instance (backward compatibility).

Args:
model_name: Model name or path
**kwargs: Additional configuration

Create an embedder instance by delegating to EmbedderFactory.
Parameters:
model_name (str): Model identifier or factory key (defaults to "jinaai/jina-embeddings-v4").
**kwargs: Additional keyword arguments forwarded to EmbedderFactory.create.
Returns:
Embedder instance
EmbedderBase: A new embedder instance for the requested model.
"""
return EmbedderFactory.create(model_name, **kwargs)
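A quick usage sketch of this helper, assuming the package re-exports create_embedder from core.embedders and that device/use_fp16 are valid EmbeddingConfig fields (both assumptions, not confirmed by this diff):

# Hypothetical usage of the backward-compatibility helper shown above.
# The import path and the extra keyword arguments are assumptions.
from core.embedders import create_embedder

embedder = create_embedder("jinaai/jina-embeddings-v4", device="cuda", use_fp16=True)
vectors = embedder.embed_texts(["hello world"], task="retrieval")
print(vectors.shape)  # expected (1, embedder.embedding_dimension)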
123 changes: 76 additions & 47 deletions core/embedders/embedders_base.py
@@ -40,10 +40,11 @@ class EmbedderBase(ABC):

def __init__(self, config: Optional[EmbeddingConfig] = None):
"""
Initialize embedder with configuration.

Args:
config: Embedding configuration
Initialize the embedder with an EmbeddingConfig.

If no config is provided, a default EmbeddingConfig with model_name="default" is created and assigned.
Parameters:
config (Optional[EmbeddingConfig]): Configuration for the embedder; when None a default config is used.
"""
self.config = config or EmbeddingConfig(model_name="default")
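A small sketch of the defaulting behavior described above; the EmbeddingConfig import path and the concrete subclass name are assumptions:

from core.embedders.embedders_base import EmbeddingConfig  # path is an assumption

# With an explicit config, __init__ stores it unchanged.
cfg = EmbeddingConfig(model_name="jinaai/jina-embeddings-v4")
# embedder = SomeConcreteEmbedder(config=cfg)

# With no config, __init__ falls back to EmbeddingConfig(model_name="default"):
# embedder = SomeConcreteEmbedder()
# assert embedder.config.model_name == "default"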

@@ -53,74 +54,91 @@ def embed_texts(self,
task: str = "retrieval",
batch_size: Optional[int] = None) -> np.ndarray:
"""
Embed a list of texts.

Args:
texts: List of texts to embed
task: Task type (retrieval, classification, etc.)
batch_size: Override default batch size

Returns:
Array of embeddings (N x D)
"""
Embed a list of texts into dense vectors.

Returns a 2-D NumPy array of shape (len(texts), embedding_dimension) where each row is the embedding for the corresponding input text.

Parameters:
texts: List of input texts to embed.
task: Optional task hint that may affect embedding behavior (default "retrieval").
batch_size: Optional override for the embedding batch size; if omitted the embedder's configured batch size is used.

Returns:
np.ndarray: 2-D array with one embedding vector per input text.
"""
pass

@abstractmethod
def embed_single(self,
text: str,
task: str = "retrieval") -> np.ndarray:
"""
Embed a single text.

Args:
text: Text to embed
task: Task type

Returns:
Embedding vector (1D array)
"""
Embed a single piece of text and return its embedding vector.
Parameters:
text (str): The input text to embed.
task (str): Optional task hint affecting embedding behavior (default "retrieval").
Returns:
numpy.ndarray: 1-D embedding vector whose length equals the embedder's embedding_dimension.
"""
pass

def embed_queries(self,
queries: List[str],
batch_size: Optional[int] = None) -> np.ndarray:
"""
Embed search queries (convenience method).

Args:
queries: List of search queries
batch_size: Override default batch size

Returns:
Array of query embeddings
"""
Embed a list of search queries into embedding vectors (convenience wrapper).

This is a thin convenience method that produces embeddings for the provided queries using the model's retrieval embedding pathway.

Parameters:
queries: List of query strings to embed.
batch_size: Optional override for the embedding batch size.

Returns:
A 2D numpy array of shape (len(queries), embedding_dimension) containing the embeddings.
"""
return self.embed_texts(queries, task="retrieval", batch_size=batch_size)

def embed_documents(self,
documents: List[str],
batch_size: Optional[int] = None) -> np.ndarray:
"""
Embed documents for retrieval (convenience method).

Args:
documents: List of documents
batch_size: Override default batch size

Returns:
Array of document embeddings
"""
Convenience wrapper to embed a list of documents for retrieval.
Parameters:
documents (List[str]): Documents to embed.
batch_size (Optional[int]): Optional override for the batch size; if None uses the embedder's configured batch size.
Returns:
np.ndarray: 2D array of shape (len(documents), embedding_dimension) containing the document embeddings.
"""
return self.embed_texts(documents, task="retrieval", batch_size=batch_size)
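Both convenience wrappers route through embed_texts(..., task="retrieval"), so a small retrieval loop can be written against the base class alone. A sketch, where rank_documents is a hypothetical helper and the import path is an assumption:

from typing import List

import numpy as np

from core.embedders.embedders_base import EmbedderBase  # path is an assumption


def rank_documents(embedder: EmbedderBase, query: str, documents: List[str]) -> List[int]:
    # Both calls below use the convenience wrappers documented above.
    doc_vecs = embedder.embed_documents(documents)        # (N, D)
    query_vec = embedder.embed_queries([query])[0]        # (D,)
    # Cosine similarity of the query against every document embedding.
    scores = doc_vecs @ query_vec / (
        np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-12
    )
    return np.argsort(-scores).tolist()                   # best match first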

@property
@abstractmethod
def embedding_dimension(self) -> int:
"""Get the dimension of embeddings produced by this model."""
"""
Return the dimensionality of vectors produced by this embedder.

This is an abstract property that concrete embedder implementations must provide;
it indicates the length of each embedding vector (number of dimensions).

Returns:
int: Number of dimensions in each embedding vector.
"""
pass

@property
@abstractmethod
def max_sequence_length(self) -> int:
"""Get the maximum sequence length supported."""
"""
Maximum input sequence length supported by the embedder.

Returns:
int: The model's maximum number of input tokens (or sequence units) accepted for a single inference. Implementations should return the hard limit used for chunking and validation.
"""
pass

@property
@@ -130,15 +148,26 @@ def supports_late_chunking(self) -> bool:

@property
def supports_multimodal(self) -> bool:
"""Whether this embedder supports multimodal inputs."""
"""
Whether the embedder accepts multimodal (non-text) inputs such as images or audio.

Implementations that support embedding inputs beyond plain text should override this to return True. Defaults to False.
"""
return False

def get_model_info(self) -> Dict[str, Any]:
"""
Get information about the model.

Return a dictionary of metadata describing the embedder and its configured model.
Returns:
Dictionary with model metadata
dict: Metadata keys:
- model_name (str): Name of the configured model.
- embedding_dimension (int): Dimensionality of produced embedding vectors.
- max_sequence_length (int): Maximum supported input sequence length (tokens).
- supports_late_chunking (bool): Whether the embedder supports late chunking.
- supports_multimodal (bool): Whether the embedder accepts multimodal inputs.
- device (str): Device configured for the model (e.g., "cuda", "cpu").
- use_fp16 (bool): Whether mixed/half precision is enabled for the model.
"""
return {
"model_name": self.config.model_name,
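The rest of get_model_info is folded out of the hunk above. To make the abstract contract concrete, a minimal illustrative subclass is sketched below; the hash-seeded vectors are not a real embedding model, the import path is an assumption, and supports_late_chunking / supports_multimodal are assumed to keep their concrete defaults:

import zlib
from typing import List, Optional

import numpy as np

from core.embedders.embedders_base import EmbedderBase  # path is an assumption


class ToyEmbedder(EmbedderBase):
    """Illustration only: deterministic pseudo-embeddings, not a real model."""

    def embed_texts(self, texts: List[str], task: str = "retrieval",
                    batch_size: Optional[int] = None) -> np.ndarray:
        # Shape contract from embed_texts: (len(texts), embedding_dimension).
        return np.stack([self.embed_single(t, task=task) for t in texts])

    def embed_single(self, text: str, task: str = "retrieval") -> np.ndarray:
        # Seed from a stable checksum so the same text always maps to the same vector.
        rng = np.random.default_rng(zlib.crc32(text.encode("utf-8")))
        return rng.standard_normal(self.embedding_dimension).astype(np.float32)

    @property
    def embedding_dimension(self) -> int:
        return 128

    @property
    def max_sequence_length(self) -> int:
        return 512

With this in place, ToyEmbedder().get_model_info() would report model_name "default" (from the fallback config) plus the property values above, along with whatever device and use_fp16 defaults EmbeddingConfig defines.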
77 changes: 46 additions & 31 deletions core/embedders/embedders_factory.py
@@ -32,11 +32,11 @@ class EmbedderFactory:
@classmethod
def register(cls, name: str, embedder_class: type):
"""
Register an embedder class.

Args:
name: Name to register under
embedder_class: Embedder class to register
Register an embedder class under a given name in the factory registry.
Adds the provided embedder class to the class-level registry mapping used by
EmbedderFactory. If a different class is already registered under the same
name, it will be replaced.
"""
cls._embedders[name] = embedder_class
logger.info(f"Registered embedder: {name}")
@@ -47,19 +47,21 @@ def create(cls,
config: Optional[EmbeddingConfig] = None,
**kwargs) -> EmbedderBase:
"""
Create an embedder instance.

Args:
model_name: Name or path of the model
config: Embedding configuration
**kwargs: Additional arguments for the embedder

Returns:
Embedder instance

Raises:
ValueError: If no suitable embedder found
"""
Create and return an EmbedderBase instance for the given model.

If `config` is not provided, an EmbeddingConfig is constructed from `model_name` and any extra keyword arguments. The factory determines an embedder type from `model_name`, attempts on-demand registration for that type, and instantiates the registered embedder class.

Parameters:
model_name (str): Model identifier or path used to pick and configure the embedder.
config (Optional[EmbeddingConfig]): Pre-built embedding configuration. If omitted, one is created.
**kwargs: Additional fields forwarded to EmbeddingConfig when `config` is not supplied.

Returns:
EmbedderBase: An instance of the selected embedder class initialized with `config`.

Raises:
ValueError: If no embedder is registered (after auto-registration) for the determined embedder type.
"""
# Create config if not provided
if config is None:
config = EmbeddingConfig(model_name=model_name, **kwargs)
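A usage sketch of the two calling conventions described in this docstring; import paths and the extra keyword arguments are assumptions:

from core.embedders.embedders_base import EmbeddingConfig     # paths are assumptions
from core.embedders.embedders_factory import EmbedderFactory

# Path 1: let the factory build an EmbeddingConfig from kwargs.
embedder = EmbedderFactory.create("jinaai/jina-embeddings-v4", device="cuda")

# Path 2: pass a pre-built config; kwargs are only used when config is None.
cfg = EmbeddingConfig(model_name="jinaai/jina-embeddings-v4", use_fp16=True)
embedder = EmbedderFactory.create("jinaai/jina-embeddings-v4", config=cfg)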
@@ -86,13 +88,15 @@
@classmethod
def _determine_embedder_type(cls, model_name: str) -> str:
"""
Determine embedder type from model name.

Args:
model_name: Model name or path

Determine the embedder type from a model name.

Performs a case-insensitive substring check against the provided model_name and returns one of the supported embedder type keys: "jina", "sentence", "openai", or "cohere". If no known marker is found, defaults to "jina".

Parameters:
model_name: Model identifier or path used to infer the embedder type.

Returns:
Embedder type string
A string key identifying the embedder type: "jina", "sentence", "openai", or "cohere".
"""
model_lower = model_name.lower()

@@ -111,10 +115,16 @@ def _determine_embedder_type(cls, model_name: str) -> str:
@classmethod
def _auto_register(cls, embedder_type: str):
"""
Attempt to auto-register an embedder type.

Args:
embedder_type: Type of embedder to register
Auto-registers a known embedder implementation for the given embedder type.

If embedder_type == "jina", attempts to import JinaV4Embedder and register it under the name "jina".
If embedder_type == "sentence", emits a warning that the sentence-transformers embedder is not yet migrated.
For any other embedder_type, emits a warning about the unknown type.

Side effects:
- May call cls.register(...) to add an embedder to the factory registry.
- Logs warnings for unimplemented or unknown types.
- Logs an error if an ImportError occurs while attempting to import a backend (the exception is not propagated).
"""
try:
if embedder_type == "jina":
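The body of this try block is mostly folded. Per the docstring, the flow is roughly the standalone sketch below; auto_register_sketch is a hypothetical stand-in and the JinaV4Embedder import path is an assumption:

import logging

logger = logging.getLogger(__name__)


def auto_register_sketch(factory, embedder_type: str) -> None:
    # Hypothetical reconstruction of the behavior described in the docstring above.
    try:
        if embedder_type == "jina":
            from core.embedders.embedders_jina import JinaV4Embedder  # assumed path
            factory.register("jina", JinaV4Embedder)
        elif embedder_type == "sentence":
            logger.warning("sentence-transformers embedder not yet migrated")
        else:
            logger.warning(f"Unknown embedder type: {embedder_type}")
    except ImportError as exc:
        # Logged but deliberately not re-raised, matching the docstring.
        logger.error(f"Failed to import backend for {embedder_type!r}: {exc}")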
@@ -131,10 +141,15 @@ def _auto_register(cls, embedder_type: str):
@classmethod
def list_available(cls) -> Dict[str, Any]:
"""
List available embedders.

Return a mapping of registered embedder names to their basic metadata.

For each embedder registered in the factory registry, the mapping contains either:
- a dict with keys `"class"` (the embedder class name) and `"module"` (the class' module path),
or
- a dict with key `"error"` containing a string message if retrieving the info failed.

Returns:
Dictionary of available embedders with their info
Dict[str, Any]: Mapping from embedder registry name to metadata or error information.
"""
available = {}
for name, embedder_class in cls._embedders.items():
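To close the loop, a sketch of inspecting the registry after a create() call has triggered auto-registration; the import path is an assumption and the printed fields follow the docstring above:

from core.embedders.embedders_factory import EmbedderFactory  # path is an assumption

EmbedderFactory.create("jinaai/jina-embeddings-v4")  # auto-registers the "jina" backend
for name, info in EmbedderFactory.list_available().items():
    print(name, info.get("class"), info.get("module"), info.get("error"))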