1 change: 1 addition & 0 deletions .gitignore
@@ -237,3 +237,4 @@ TASK_MEMORY.md
*.code-workspace
/agent-memory-client/agent-memory-client-java/.gradle/
augment*.md
dev_docs/
9 changes: 8 additions & 1 deletion CLAUDE.md
@@ -110,6 +110,9 @@ query = VectorQuery(vector=embedding, vector_field_name="vector", return_fields=

## Critical Rules

### Import Placement
Place all imports at the top of modules, not inside functions. Inline imports should only be used when strictly necessary (e.g., avoiding circular dependencies, optional dependencies, or significant startup performance concerns).

### Authentication
- **PRODUCTION**: Never set `DISABLE_AUTH=true` in production
- **DEVELOPMENT**: Use `DISABLE_AUTH=true` for local testing only
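
A minimal sketch of the Import Placement rule added above (module and function names here are hypothetical and not part of this PR):

```python
# Preferred: imports at the top of the module.
import json


def load_overrides(path: str) -> dict:
    # Module-level imports are already available; nothing is imported inside the function.
    with open(path) as f:
        return json.load(f)


# Inline import only when strictly necessary, e.g. to break a circular dependency:
def get_router():
    from my_app.routes import router  # hypothetical module, imported lazily to break a cycle

    return router
```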
@@ -149,7 +152,11 @@ agent_memory_server/
├── summarization.py # Conversation summarization
├── extraction.py # Topic and entity extraction
├── filters.py # Search filtering logic
├── llms.py # LLM provider integrations
├── llm/ # LLM client package (LiteLLM-based)
│ ├── __init__.py # Re-exports for clean imports
│ ├── client.py # LLMClient class with chat/embedding methods
│ ├── types.py # ChatCompletionResponse, EmbeddingResponse, LLMBackend
│ └── exceptions.py # LLMClientError, ModelValidationError, APIKeyMissingError
├── migrations.py # Database schema migrations
├── docket_tasks.py # Background task definitions
├── cli.py # Command-line interface
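The new `llm/` package documented above replaces `llms.py` as the single import point for LLM calls. A sketch of the intended call pattern, adapted from the package docstring and the `response.content` usage in `extraction.py` below (the model name is illustrative):

```python
from agent_memory_server.llm import ChatCompletionResponse, LLMClient


async def ask(prompt: str) -> str:
    # One entry point for chat completions, regardless of the provider backend.
    response: ChatCompletionResponse = await LLMClient.create_chat_completion(
        model="gpt-4o",  # illustrative model name from the package docstring
        messages=[{"role": "user", "content": prompt}],
    )
    # The unified response exposes the text directly,
    # instead of response.choices[0].message.content.
    return response.content
```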
10 changes: 4 additions & 6 deletions agent_memory_server/api.py
Expand Up @@ -11,7 +11,7 @@
from agent_memory_server.config import settings
from agent_memory_server.dependencies import HybridBackgroundTasks
from agent_memory_server.filters import SessionId, UserId
from agent_memory_server.llms import get_model_client, get_model_config
from agent_memory_server.llm import LLMClient
from agent_memory_server.logging import get_logger
from agent_memory_server.models import (
AckResponse,
@@ -101,7 +101,7 @@ def _get_effective_token_limit(
return context_window_max
# If model_name is provided, get its max_tokens from our config
if model_name is not None:
model_config = get_model_config(model_name)
model_config = LLMClient.get_model_config(model_name)
return model_config.max_tokens
# Otherwise use a conservative default (GPT-3.5 context window)
return 16000 # Conservative default
@@ -238,9 +238,8 @@ async def _summarize_working_memory(
if current_tokens <= token_threshold:
return memory

# Get model client for summarization
client = await get_model_client(model)
model_config = get_model_config(model)
# Get model config for summarization
model_config = LLMClient.get_model_config(model)
summarization_max_tokens = model_config.max_tokens

# Token allocation for summarization (same logic as original summarize_session)
@@ -305,7 +304,6 @@ async def _summarize_working_memory(
# Generate summary
summary, summary_tokens_used = await _incremental_summary(
model,
client,
memory.context, # Use existing context as base
messages_to_summarize,
)
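Net effect of the `api.py` changes: model metadata now comes from the class-level `LLMClient.get_model_config(...)` lookup, so no client instance is created just to read token limits, and `_incremental_summary` no longer takes a client argument. A sketch of the token-limit helper reconstructed from the hunks above (the real signature and surrounding logic may differ):

```python
from agent_memory_server.llm import LLMClient


def _get_effective_token_limit(
    model_name: str | None = None,
    context_window_max: int | None = None,
) -> int:
    # An explicit context-window cap takes precedence.
    if context_window_max is not None:
        return context_window_max
    # If model_name is provided, get its max_tokens from our config.
    if model_name is not None:
        model_config = LLMClient.get_model_config(model_name)
        return model_config.max_tokens
    # Otherwise use a conservative default (GPT-3.5 context window).
    return 16000
```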
22 changes: 5 additions & 17 deletions agent_memory_server/extraction.py
@@ -9,12 +9,7 @@
# Lazy-import transformers in get_ner_model to avoid heavy deps at startup
from agent_memory_server.config import settings
from agent_memory_server.filters import DiscreteMemoryExtracted, MemoryType
from agent_memory_server.llms import (
AnthropicClientWrapper,
BedrockClientWrapper,
OpenAIClientWrapper,
get_model_client,
)
from agent_memory_server.llm import LLMClient
from agent_memory_server.logging import get_logger
from agent_memory_server.models import MemoryRecord

@@ -128,15 +123,10 @@ def extract_entities(text: str) -> list[str]:
async def extract_topics_llm(
text: str,
num_topics: int | None = None,
client: OpenAIClientWrapper
| AnthropicClientWrapper
| BedrockClientWrapper
| None = None,
) -> list[str]:
"""
Extract topics from text using the LLM model.
"""
_client = client or await get_model_client(settings.topic_model)
_num_topics = num_topics if num_topics is not None else settings.top_k_topics

prompt = f"""
@@ -152,17 +142,15 @@ async def extract_topics_llm(

async for attempt in AsyncRetrying(stop=stop_after_attempt(3)):
with attempt:
response = await _client.create_chat_completion(
response = await LLMClient.create_chat_completion(
model=settings.generation_model,
prompt=prompt,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"},
)
try:
topics = json.loads(response.choices[0].message.content)["topics"]
topics = json.loads(response.content)["topics"]
except (json.JSONDecodeError, KeyError):
logger.error(
f"Error decoding JSON: {response.choices[0].message.content}"
)
logger.error(f"Error decoding JSON: {response.content}")
topics = []
if topics:
topics = topics[:_num_topics]
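The `extraction.py` change drops the per-provider client wrappers in favor of the same unified client. A condensed sketch of the new call and JSON handling (the retry loop and topic-count trimming from the real function are omitted):

```python
import json

from agent_memory_server.config import settings
from agent_memory_server.llm import LLMClient


async def topics_from_prompt(prompt: str) -> list[str]:
    response = await LLMClient.create_chat_completion(
        model=settings.generation_model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},  # ask the model for a JSON object
    )
    try:
        return json.loads(response.content)["topics"]
    except (json.JSONDecodeError, KeyError):
        return []  # fall back to no topics if the model returns malformed JSON
```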
47 changes: 47 additions & 0 deletions agent_memory_server/llm/__init__.py
@@ -0,0 +1,47 @@
"""
LLM client package for unified LLM operations.

This package provides a single entry point for all LLM interactions,
abstracting away the underlying provider (OpenAI, Anthropic, Bedrock, etc.).

Usage:
from agent_memory_server.llm import LLMClient, ChatCompletionResponse

response = await LLMClient.create_chat_completion(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello"}],
)
"""

from agent_memory_server.llm.client import (
LLMClient,
get_model_config,
optimize_query_for_vector_search,
)
from agent_memory_server.llm.exceptions import (
APIKeyMissingError,
LLMClientError,
ModelValidationError,
)
from agent_memory_server.llm.types import (
ChatCompletionResponse,
EmbeddingResponse,
LLMBackend,
)


__all__ = [
# Client
"LLMClient",
# Convenience functions
"get_model_config",
"optimize_query_for_vector_search",
# Exceptions
"LLMClientError",
"ModelValidationError",
"APIKeyMissingError",
# Types
"ChatCompletionResponse",
"EmbeddingResponse",
"LLMBackend",
]
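
The re-exported exceptions let callers handle failures without touching provider-specific error types. A hedged sketch (this file does not show whether the exceptions share a base class or exactly when each is raised, so both are caught explicitly and the comments describe assumed semantics):

```python
from agent_memory_server.llm import APIKeyMissingError, LLMClient, ModelValidationError


async def safe_completion(prompt: str) -> str | None:
    try:
        response = await LLMClient.create_chat_completion(
            model="gpt-4o",  # illustrative model name
            messages=[{"role": "user", "content": prompt}],
        )
    except (ModelValidationError, APIKeyMissingError):
        # Assumed to signal configuration problems: an unknown model name or missing credentials.
        return None
    return response.content
```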