Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions environments/wiki_search/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,7 @@ Notes:
| ToolRubric metrics | Tool execution success and format adherence |
| JudgeRubric metrics | Judge-scored answer quality |

### Changelog

#### v0.1.22 (Jan 22, 2026)
- Make ChromaDB initialization lazy to allow multiple env instances to run concurrently
2 changes: 1 addition & 1 deletion environments/wiki_search/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "wiki-search"
description = "Agentic RAG over Wikipedia pages for trivia Q&A"
tags = ["wikipedia", "multi-turn", "agentic-search", "rag", "train", "eval", "llm-judge"]
requires-python = ">=3.11"
version = "0.1.21"
version = "0.1.22"
dependencies = [
"verifiers>=0.1.8",
"chromadb",
Expand Down
36 changes: 20 additions & 16 deletions environments/wiki_search/wiki_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,6 @@ def load_environment(
corpus_split: str = "train",
chroma_db_dir: str = CHROMA_DB_DIR,
) -> vf.Environment:
# ensure Chroma server is running in client/server mode
# ensure_chroma_server(chroma_db_dir)
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
model_name=embed_model,
api_base=embed_base_url,
api_key=os.getenv(embed_api_key_var, "EMPTY"),
)
client = chromadb.PersistentClient(path=chroma_db_dir)
collection = client.get_or_create_collection(
name="wiki_titles",
embedding_function=cast(EmbeddingFunction[Embeddable], openai_ef),
)
# load corpus into memory and build page_id -> row index
corpus = load_dataset(corpus_dataset, split=corpus_split)
page_id_to_title: dict[str, str] = {}
Expand All @@ -58,8 +46,25 @@ def load_environment(
page_id_to_title[pid] = title
page_id_to_content[pid] = content

# initialize chroma collection
def init_chroma() -> None:
# Lazily-created Chroma collection, memoized in a closure-level dict so
# repeated tool calls reuse a single client/collection.
# NOTE(review): this dict is created fresh on every load_environment()
# call, so init runs once per env instance — not "once across all env
# instances"; confirm whether cross-instance sharing was intended.
_chroma_state: dict = {"collection": None}

def _get_collection():
    """Return the wiki_titles Chroma collection, creating it on first use.

    Deferring PersistentClient construction out of load_environment()
    lets multiple env instances be constructed concurrently without
    touching the on-disk Chroma store up front.

    NOTE(review): the None-check is not guarded by a lock; two
    concurrent first callers could both run the init path. Presumably
    get_or_create_collection and the upsert in _init_chroma are
    idempotent — verify.
    """
    if _chroma_state["collection"] is None:
        # Embeddings are served by an OpenAI-compatible endpoint; the key
        # falls back to "EMPTY" when the env var is unset (local servers).
        openai_ef = embedding_functions.OpenAIEmbeddingFunction(
            model_name=embed_model,
            api_base=embed_base_url,
            api_key=os.getenv(embed_api_key_var, "EMPTY"),
        )
        client = chromadb.PersistentClient(path=chroma_db_dir)
        _chroma_state["collection"] = client.get_or_create_collection(
            name="wiki_titles",
            # cast: OpenAIEmbeddingFunction satisfies the EmbeddingFunction
            # protocol but is not typed as EmbeddingFunction[Embeddable].
            embedding_function=cast(EmbeddingFunction[Embeddable], openai_ef),
        )
        # Upsert any corpus pages missing from the freshly-opened collection.
        _init_chroma(_chroma_state["collection"])
    return _chroma_state["collection"]

def _init_chroma(collection) -> None:
# upsert missing pages
all_ids = list(page_id_to_title.keys())
existing: set[str] = set()
Expand All @@ -85,8 +90,6 @@ def init_chroma() -> None:
metadatas=metadatas[i : i + bs],
)

init_chroma()

# helper function to normalize section ids
def normalize_id(text: str) -> str:
"""Normalize free text into an id: lowercased with spaces as underscores.
Expand All @@ -108,6 +111,7 @@ async def search_pages(query: str) -> list[dict]:
example:
"basketball" -> [{"page_id": "basketball", "title": "Basketball"}, {"page_id": "basketball_rules", "title": "Basketball Rules"}, ...]
"""
collection = _get_collection()  # lazy init on first search call
async with _get_chroma_semaphore():
results = await asyncio.to_thread(
collection.query, query_texts=[query], n_results=10
Expand Down
Loading