Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions hindsight-api/hindsight_api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
# Environment variable names used to configure the embeddings backend.
ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER"
ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL"
ENV_EMBEDDINGS_LOCAL_FORCE_CPU = "HINDSIGHT_API_EMBEDDINGS_LOCAL_FORCE_CPU"
# Opt-in flag for models that require executing custom repository code on load;
# kept as a separate env var so it stays disabled unless explicitly enabled.
ENV_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE = "HINDSIGHT_API_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE"
ENV_EMBEDDINGS_TEI_URL = "HINDSIGHT_API_EMBEDDINGS_TEI_URL"
ENV_EMBEDDINGS_OPENAI_API_KEY = "HINDSIGHT_API_EMBEDDINGS_OPENAI_API_KEY"
ENV_EMBEDDINGS_OPENAI_MODEL = "HINDSIGHT_API_EMBEDDINGS_OPENAI_MODEL"
Expand Down Expand Up @@ -98,6 +99,7 @@
# Environment variable names used to configure the reranker backend.
ENV_RERANKER_LOCAL_MODEL = "HINDSIGHT_API_RERANKER_LOCAL_MODEL"
ENV_RERANKER_LOCAL_FORCE_CPU = "HINDSIGHT_API_RERANKER_LOCAL_FORCE_CPU"
ENV_RERANKER_LOCAL_MAX_CONCURRENT = "HINDSIGHT_API_RERANKER_LOCAL_MAX_CONCURRENT"
# Opt-in flag for reranker models that require executing custom repository code
# on load; disabled unless explicitly set by the operator.
ENV_RERANKER_LOCAL_TRUST_REMOTE_CODE = "HINDSIGHT_API_RERANKER_LOCAL_TRUST_REMOTE_CODE"
ENV_RERANKER_TEI_URL = "HINDSIGHT_API_RERANKER_TEI_URL"
ENV_RERANKER_TEI_BATCH_SIZE = "HINDSIGHT_API_RERANKER_TEI_BATCH_SIZE"
ENV_RERANKER_TEI_MAX_CONCURRENT = "HINDSIGHT_API_RERANKER_TEI_MAX_CONCURRENT"
Expand Down Expand Up @@ -201,13 +203,17 @@
# Default settings for the embeddings backend.
DEFAULT_EMBEDDINGS_PROVIDER = "local"
DEFAULT_EMBEDDINGS_LOCAL_MODEL = "BAAI/bge-small-en-v1.5"
# Forcing CPU avoids the MPS/XPC issues observed on macOS for local embeddings.
DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU = False
# Security default: never trust remote model code unless explicitly enabled;
# a few model repos cannot load without it.
DEFAULT_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE = False
DEFAULT_EMBEDDINGS_OPENAI_MODEL = "text-embedding-3-small"
DEFAULT_EMBEDDING_DIMENSION = 384

# Default settings for the reranker backend.
DEFAULT_RERANKER_PROVIDER = "local"
DEFAULT_RERANKER_LOCAL_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# Forcing CPU avoids the MPS/XPC issues observed on macOS for the local reranker.
DEFAULT_RERANKER_LOCAL_FORCE_CPU = False
# Bounds concurrent CPU-bound reranking calls to keep the host from thrashing.
DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT = 4
# Security default: never trust remote model code unless explicitly enabled;
# required by some models such as jina-reranker-v2.
DEFAULT_RERANKER_LOCAL_TRUST_REMOTE_CODE = False
DEFAULT_RERANKER_TEI_BATCH_SIZE = 128
DEFAULT_RERANKER_TEI_MAX_CONCURRENT = 8
DEFAULT_RERANKER_MAX_CANDIDATES = 300
Expand Down Expand Up @@ -404,6 +410,7 @@ class HindsightConfig:
# Embeddings backend configuration (populated from env vars in from_env()).
embeddings_provider: str  # provider id, e.g. "local", "tei", "openai", "cohere", "litellm"
embeddings_local_model: str  # model identifier for the local provider
embeddings_local_force_cpu: bool  # force CPU inference for local embeddings
embeddings_local_trust_remote_code: bool  # allow model repos to execute custom code (security-sensitive, off by default)
embeddings_tei_url: str | None  # TEI server URL when the TEI provider is selected
embeddings_openai_base_url: str | None  # optional override for the OpenAI-compatible endpoint
embeddings_cohere_api_key: str | None  # API key for Cohere embeddings
Expand All @@ -418,6 +425,7 @@ class HindsightConfig:
# Reranker backend configuration (populated from env vars in from_env()).
reranker_local_model: str  # cross-encoder model identifier for the local provider
reranker_local_force_cpu: bool  # force CPU inference for the local reranker
reranker_local_max_concurrent: int  # cap on concurrent CPU-bound rerank calls
reranker_local_trust_remote_code: bool  # allow model repos to execute custom code (security-sensitive, off by default)
reranker_tei_url: str | None  # TEI server URL when the TEI provider is selected
reranker_tei_batch_size: int  # batch size per TEI rerank request
reranker_tei_max_concurrent: int  # cap on concurrent TEI rerank requests
Expand Down Expand Up @@ -607,6 +615,10 @@ def from_env(cls) -> "HindsightConfig":
ENV_EMBEDDINGS_LOCAL_FORCE_CPU, str(DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU)
).lower()
in ("true", "1"),
embeddings_local_trust_remote_code=os.getenv(
ENV_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE, str(DEFAULT_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE)
).lower()
in ("true", "1"),
embeddings_tei_url=os.getenv(ENV_EMBEDDINGS_TEI_URL),
embeddings_openai_base_url=os.getenv(ENV_EMBEDDINGS_OPENAI_BASE_URL) or None,
# Cohere embeddings (with backward-compatible fallback to shared API key)
Expand All @@ -628,6 +640,10 @@ def from_env(cls) -> "HindsightConfig":
reranker_local_max_concurrent=int(
os.getenv(ENV_RERANKER_LOCAL_MAX_CONCURRENT, str(DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT))
),
reranker_local_trust_remote_code=os.getenv(
ENV_RERANKER_LOCAL_TRUST_REMOTE_CODE, str(DEFAULT_RERANKER_LOCAL_TRUST_REMOTE_CODE)
).lower()
in ("true", "1"),
reranker_tei_url=os.getenv(ENV_RERANKER_TEI_URL),
reranker_tei_batch_size=int(os.getenv(ENV_RERANKER_TEI_BATCH_SIZE, str(DEFAULT_RERANKER_TEI_BATCH_SIZE))),
reranker_tei_max_concurrent=int(
Expand Down
16 changes: 15 additions & 1 deletion hindsight-api/hindsight_api/engine/cross_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
DEFAULT_RERANKER_LOCAL_FORCE_CPU,
DEFAULT_RERANKER_LOCAL_MAX_CONCURRENT,
DEFAULT_RERANKER_LOCAL_MODEL,
DEFAULT_RERANKER_LOCAL_TRUST_REMOTE_CODE,
DEFAULT_RERANKER_PROVIDER,
DEFAULT_RERANKER_TEI_BATCH_SIZE,
DEFAULT_RERANKER_TEI_MAX_CONCURRENT,
Expand All @@ -34,6 +35,7 @@
ENV_RERANKER_LOCAL_FORCE_CPU,
ENV_RERANKER_LOCAL_MAX_CONCURRENT,
ENV_RERANKER_LOCAL_MODEL,
ENV_RERANKER_LOCAL_TRUST_REMOTE_CODE,
ENV_RERANKER_PROVIDER,
ENV_RERANKER_TEI_BATCH_SIZE,
ENV_RERANKER_TEI_MAX_CONCURRENT,
Expand Down Expand Up @@ -98,7 +100,13 @@ class LocalSTCrossEncoder(CrossEncoderModel):
_executor: ThreadPoolExecutor | None = None
_max_concurrent: int = 4 # Limit concurrent CPU-bound reranking calls

def __init__(self, model_name: str | None = None, max_concurrent: int = 4, force_cpu: bool = False):
def __init__(
self,
model_name: str | None = None,
max_concurrent: int = 4,
force_cpu: bool = False,
trust_remote_code: bool = False,
):
"""
Initialize local SentenceTransformers cross-encoder.

Expand All @@ -109,9 +117,13 @@ def __init__(self, model_name: str | None = None, max_concurrent: int = 4, force
Higher values may cause CPU thrashing under load.
force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
Default: False
trust_remote_code: Allow loading models with custom code (security risk).
Required for some models like jina-reranker-v2-base-multilingual.
Default: False (disabled for security)
"""
self.model_name = model_name or DEFAULT_RERANKER_LOCAL_MODEL
self.force_cpu = force_cpu
self.trust_remote_code = trust_remote_code
self._model = None
LocalSTCrossEncoder._max_concurrent = max_concurrent

Expand Down Expand Up @@ -177,6 +189,7 @@ async def initialize(self) -> None:
self.model_name,
device=device,
model_kwargs={"low_cpu_mem_usage": False},
trust_remote_code=self.trust_remote_code,
)
finally:
# Restore original logging level
Expand Down Expand Up @@ -843,6 +856,7 @@ def create_cross_encoder_from_env() -> CrossEncoderModel:
model_name=config.reranker_local_model,
max_concurrent=config.reranker_local_max_concurrent,
force_cpu=config.reranker_local_force_cpu,
trust_remote_code=config.reranker_local_trust_remote_code,
)
elif provider == "cohere":
api_key = config.reranker_cohere_api_key
Expand Down
10 changes: 9 additions & 1 deletion hindsight-api/hindsight_api/engine/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
DEFAULT_EMBEDDINGS_LITELLM_MODEL,
DEFAULT_EMBEDDINGS_LOCAL_FORCE_CPU,
DEFAULT_EMBEDDINGS_LOCAL_MODEL,
DEFAULT_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE,
DEFAULT_EMBEDDINGS_OPENAI_MODEL,
DEFAULT_EMBEDDINGS_PROVIDER,
DEFAULT_LITELLM_API_BASE,
ENV_EMBEDDINGS_COHERE_API_KEY,
ENV_EMBEDDINGS_LOCAL_FORCE_CPU,
ENV_EMBEDDINGS_LOCAL_MODEL,
ENV_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE,
ENV_EMBEDDINGS_OPENAI_API_KEY,
ENV_EMBEDDINGS_OPENAI_BASE_URL,
ENV_EMBEDDINGS_OPENAI_MODEL,
Expand Down Expand Up @@ -90,7 +92,7 @@ class LocalSTEmbeddings(Embeddings):
The embedding dimension is auto-detected from the model.
"""

def __init__(self, model_name: str | None = None, force_cpu: bool = False):
def __init__(self, model_name: str | None = None, force_cpu: bool = False, trust_remote_code: bool = False):
"""
Initialize local SentenceTransformers embeddings.

Expand All @@ -99,9 +101,13 @@ def __init__(self, model_name: str | None = None, force_cpu: bool = False):
Default: BAAI/bge-small-en-v1.5
force_cpu: Force CPU mode (avoids MPS/XPC issues on macOS in daemon mode).
Default: False
trust_remote_code: Allow loading models with custom code (security risk).
Required for some models with custom architectures.
Default: False (disabled for security)
"""
self.model_name = model_name or DEFAULT_EMBEDDINGS_LOCAL_MODEL
self.force_cpu = force_cpu
self.trust_remote_code = trust_remote_code
self._model = None
self._dimension: int | None = None

Expand Down Expand Up @@ -171,6 +177,7 @@ async def initialize(self) -> None:
self.model_name,
device=device,
model_kwargs={"low_cpu_mem_usage": False},
trust_remote_code=self.trust_remote_code,
)
finally:
# Restore original logging level
Expand Down Expand Up @@ -736,6 +743,7 @@ def create_embeddings_from_env() -> Embeddings:
return LocalSTEmbeddings(
model_name=config.embeddings_local_model,
force_cpu=config.embeddings_local_force_cpu,
trust_remote_code=config.embeddings_local_trust_remote_code,
)
elif provider == "openai":
# Use dedicated embeddings API key, or fall back to LLM API key
Expand Down
2 changes: 2 additions & 0 deletions hindsight-api/hindsight_api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ def main():
embeddings_provider=config.embeddings_provider,
embeddings_local_model=config.embeddings_local_model,
embeddings_local_force_cpu=config.embeddings_local_force_cpu,
embeddings_local_trust_remote_code=config.embeddings_local_trust_remote_code,
embeddings_tei_url=config.embeddings_tei_url,
embeddings_openai_base_url=config.embeddings_openai_base_url,
embeddings_cohere_api_key=config.embeddings_cohere_api_key,
Expand All @@ -209,6 +210,7 @@ def main():
reranker_local_model=config.reranker_local_model,
reranker_local_force_cpu=config.reranker_local_force_cpu,
reranker_local_max_concurrent=config.reranker_local_max_concurrent,
reranker_local_trust_remote_code=config.reranker_local_trust_remote_code,
reranker_tei_url=config.reranker_tei_url,
reranker_tei_batch_size=config.reranker_tei_batch_size,
reranker_tei_max_concurrent=config.reranker_tei_max_concurrent,
Expand Down
13 changes: 13 additions & 0 deletions hindsight-docs/docs/developer/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ export HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF=120.0 # Cap at 2min instead of 1m
|----------|-------------|---------|
| `HINDSIGHT_API_EMBEDDINGS_PROVIDER` | Provider: `local`, `tei`, `openai`, `cohere`, or `litellm` | `local` |
| `HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL` | Model for local provider | `BAAI/bge-small-en-v1.5` |
| `HINDSIGHT_API_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE` | Allow loading models with custom code (security risk, disabled by default) | `false` |
| `HINDSIGHT_API_EMBEDDINGS_TEI_URL` | TEI server URL | - |
| `HINDSIGHT_API_EMBEDDINGS_OPENAI_API_KEY` | OpenAI API key (falls back to `HINDSIGHT_API_LLM_API_KEY`) | - |
| `HINDSIGHT_API_EMBEDDINGS_OPENAI_MODEL` | OpenAI embedding model | `text-embedding-3-small` |
Expand All @@ -285,6 +286,11 @@ export HINDSIGHT_API_RETAIN_LLM_MAX_BACKOFF=120.0 # Cap at 2min instead of 1m
export HINDSIGHT_API_EMBEDDINGS_PROVIDER=local
export HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL=BAAI/bge-small-en-v1.5

# Local with custom model requiring trust_remote_code
# WARNING: Only enable trust_remote_code for models you trust (security risk)
# export HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL=your-custom-model
# export HINDSIGHT_API_EMBEDDINGS_LOCAL_TRUST_REMOTE_CODE=true

# OpenAI - cloud-based embeddings
export HINDSIGHT_API_EMBEDDINGS_PROVIDER=openai
export HINDSIGHT_API_EMBEDDINGS_OPENAI_API_KEY=sk-xxxxxxxxxxxx # or reuses HINDSIGHT_API_LLM_API_KEY
Expand Down Expand Up @@ -341,6 +347,7 @@ Supported OpenAI embedding dimensions:
| `HINDSIGHT_API_RERANKER_PROVIDER` | Provider: `local`, `tei`, `cohere`, `flashrank`, `litellm`, or `rrf` | `local` |
| `HINDSIGHT_API_RERANKER_LOCAL_MODEL` | Model for local provider | `cross-encoder/ms-marco-MiniLM-L-6-v2` |
| `HINDSIGHT_API_RERANKER_LOCAL_MAX_CONCURRENT` | Max concurrent local reranking (prevents CPU thrashing under load) | `4` |
| `HINDSIGHT_API_RERANKER_LOCAL_TRUST_REMOTE_CODE` | Allow loading models with custom code (security risk, disabled by default) | `false` |
| `HINDSIGHT_API_RERANKER_TEI_URL` | TEI server URL | - |
| `HINDSIGHT_API_RERANKER_TEI_BATCH_SIZE` | Batch size for TEI reranking | `128` |
| `HINDSIGHT_API_RERANKER_TEI_MAX_CONCURRENT` | Max concurrent TEI reranking requests | `8` |
Expand All @@ -358,6 +365,12 @@ Supported OpenAI embedding dimensions:
export HINDSIGHT_API_RERANKER_PROVIDER=local
export HINDSIGHT_API_RERANKER_LOCAL_MODEL=cross-encoder/ms-marco-MiniLM-L-6-v2

# Local with custom model requiring trust_remote_code (e.g., jina-reranker-v2)
# WARNING: Only enable trust_remote_code for models you trust (security risk)
export HINDSIGHT_API_RERANKER_PROVIDER=local
export HINDSIGHT_API_RERANKER_LOCAL_MODEL=jinaai/jina-reranker-v2-base-multilingual
export HINDSIGHT_API_RERANKER_LOCAL_TRUST_REMOTE_CODE=true

# TEI - for high-performance inference
export HINDSIGHT_API_RERANKER_PROVIDER=tei
export HINDSIGHT_API_RERANKER_TEI_URL=http://localhost:8081
Expand Down
Loading