Merged

48 commits
ba59032
pick relevant changes from mika/env-worker
mikasenghaas Jan 28, 2026
0223d62
runnable env server/client
mikasenghaas Jan 28, 2026
84433e2
aligned interface
mikasenghaas Jan 28, 2026
ddc6b2f
integrate into vf-eval
mikasenghaas Jan 28, 2026
dd262de
minor
mikasenghaas Jan 28, 2026
02715b3
do not double serialize
mikasenghaas Jan 28, 2026
86f9276
fix retries
mikasenghaas Jan 28, 2026
a38022a
pass state cols
mikasenghaas Jan 28, 2026
377867e
update pyrproject
mikasenghaas Jan 29, 2026
3cbdea8
update logging_utils
mikasenghaas Jan 29, 2026
9330c81
change signatures from state -> output
mikasenghaas Jan 29, 2026
8584482
move extra env kwargs out of load_environment signatures
mikasenghaas Jan 29, 2026
8b3dbc0
do not change signature
mikasenghaas Jan 29, 2026
0c8badd
mini
mikasenghaas Jan 29, 2026
f5a5ef7
name inner funcs
mikasenghaas Jan 29, 2026
d6edb95
deprecate gen/score sem and move global sem into generate()
mikasenghaas Jan 29, 2026
4e11a27
remove unnecesary module inti
mikasenghaas Jan 29, 2026
846bcc0
fix error info in rollout output
mikasenghaas Jan 29, 2026
0eb5ed7
run as daemon process
mikasenghaas Jan 29, 2026
409f580
robustify task cleanup in env
mikasenghaas Jan 29, 2026
67c25d2
graceful shutdowns
mikasenghaas Jan 29, 2026
dc43e9a
informative error
mikasenghaas Jan 29, 2026
bc7422f
revert
mikasenghaas Jan 29, 2026
d913d3e
remove runner
mikasenghaas Jan 29, 2026
ca191bb
fix tests
mikasenghaas Jan 29, 2026
21ff163
handle extra env kwargs
mikasenghaas Jan 29, 2026
85c7080
remove gen/score concurrency limit from eval interface
mikasenghaas Jan 29, 2026
3d76f72
update docs
mikasenghaas Jan 29, 2026
eb9c1af
handle retries and state cols on server as well
mikasenghaas Jan 29, 2026
775d0c0
fix sampling args handling
mikasenghaas Jan 29, 2026
2f0000d
use kill on second attempt
mikasenghaas Jan 29, 2026
a612201
address bugbot
mikasenghaas Jan 29, 2026
0b8f156
fix
mikasenghaas Jan 29, 2026
ed266db
address bugbot
mikasenghaas Jan 29, 2026
afa81d3
asserts
mikasenghaas Jan 29, 2026
42b90c3
add client idx
mikasenghaas Jan 29, 2026
521e130
quiet server-side env loading
mikasenghaas Jan 29, 2026
7681d92
minor
mikasenghaas Jan 29, 2026
27c2f69
dont assert nvm
mikasenghaas Jan 29, 2026
b52228d
fix eval display
mikasenghaas Jan 29, 2026
e9a6611
do not error pydantic on error response
mikasenghaas Jan 29, 2026
ec5387e
skip validation for rollout inputs (needed for multi-modal to work)
mikasenghaas Jan 29, 2026
09a6cb0
fix mutable args
mikasenghaas Jan 29, 2026
a334e9c
wait for server health
mikasenghaas Jan 29, 2026
3e0051b
lock to prevent race condition
mikasenghaas Jan 29, 2026
6efaa7a
fix logging
mikasenghaas Jan 29, 2026
85988ae
Merge branch 'main' into env-server
mikasenghaas Jan 29, 2026
ed2b7d9
docstring tweak
willccbb Jan 30, 2026
2 changes: 0 additions & 2 deletions docs/reference.md
@@ -576,8 +576,6 @@ class EvalConfig(BaseModel):
     num_examples: int
     rollouts_per_example: int
     max_concurrent: int
-    max_concurrent_generation: int | None = None
-    max_concurrent_scoring: int | None = None
     independent_scoring: bool = False
     extra_env_kwargs: dict = {}
     max_retries: int = 0
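
With the per-phase limits gone, a single max_concurrent field bounds the whole eval. A minimal sketch of the post-change config shape, assuming only the fields visible in this hunk (the real EvalConfig defines more than shown):

from pydantic import BaseModel

# Post-change shape, reconstructed from this hunk alone; the real EvalConfig
# in verifiers defines additional fields not shown here.
class EvalConfig(BaseModel):
    num_examples: int
    rollouts_per_example: int
    max_concurrent: int  # single knob, replacing the generation/scoring split
    independent_scoring: bool = False
    extra_env_kwargs: dict = {}
    max_retries: int = 0

# With pydantic's default config, a stray max_concurrent_generation kwarg is
# now silently ignored rather than stored.
cfg = EvalConfig(num_examples=1, rollouts_per_example=1, max_concurrent=8)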
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -47,6 +47,8 @@ dependencies = [
     "typing_extensions; python_version < '3.12'",
     "wget>=3.2",
     "gepa",
+    "pyzmq>=27.1.0",
+    "msgpack>=1.1.2",
 ]

 [tool.uv.sources]
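
The new pyzmq and msgpack dependencies are presumably the transport for the env server and client added in this PR. The sketch below is an assumption, not the PR's actual wiring: the REQ/REP pattern, endpoint, and message shape are all illustrative.

import msgpack
import zmq

# Illustrative endpoint; the PR's actual transport details are not shown here.
ENDPOINT = "tcp://127.0.0.1:5555"

def serve_once() -> None:
    # Hypothetical server body: receive one msgpack request, send one reply.
    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.REP)
    sock.bind(ENDPOINT)
    request = msgpack.unpackb(sock.recv(), raw=False)  # bytes -> Python object
    # Per the commit "do not double serialize", payloads are packed exactly once.
    sock.send(msgpack.packb({"echo": request}))
    sock.close()

def call(payload: dict) -> dict:
    # Hypothetical client: one round trip over a REQ socket.
    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.REQ)
    sock.connect(ENDPOINT)
    sock.send(msgpack.packb(payload))
    reply = msgpack.unpackb(sock.recv(), raw=False)
    sock.close()
    return reply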
9 changes: 3 additions & 6 deletions tests/test_env_group.py
@@ -9,7 +9,6 @@
 from verifiers import EnvGroup, Rubric, SingleTurnEnv
 from verifiers.envs.env_group import EnvGroupRubric
 from verifiers.types import State
-from verifiers.utils.async_utils import NullAsyncContext


 class TestEnvGroupRubric:
@@ -94,9 +93,8 @@ def func2(completion, **kwargs):
         state["oai_tools"] = []
         state["reward"] = None
         state["metrics"] = None
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert "func1" in state["metrics"]
         assert "func2" in state["metrics"]
@@ -131,9 +129,8 @@ async def test_env_group_rubric_unknown_task(self, mock_openai_client, make_input):
         state["oai_tools"] = []
         state["reward"] = None
         state["metrics"] = None
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["reward"] == 0.0

@@ -381,7 +378,7 @@ async def test_env_group_generate(self, mock_openai_client, make_input):
         # Mock the scoring with a properly-typed cast
         from typing import cast

-        async def mock_score_group(states, score_sem=None):
+        async def mock_score_group(states):
             for state in states:
                 state["reward"] = 0.8 if state["task"] == "math" else 0.9
                 state["metrics"] = {}
10 changes: 4 additions & 6 deletions tests/test_environment.py
@@ -510,9 +510,8 @@ async def test_no_retry_after_non_retryable_error(
         rollout_outputs = outputs["outputs"]
         assert env.call_counts[0] == 1  # No retries for non-retryable error
         assert rollout_outputs[0].get("error") is not None
-        assert (
-            "ToolError" in rollout_outputs[0]["error"]
-        )  # Error is serialized as repr string
+        error_info = rollout_outputs[0]["error"]
+        assert "ToolError" == error_info["error"]

     @pytest.mark.asyncio
     async def test_error_in_state_after_max_retries_exhausted(
@@ -532,9 +531,8 @@ async def test_error_in_state_after_max_retries_exhausted(
         rollout_outputs = outputs["outputs"]
         assert env.call_counts[0] == 3  # 1 initial + 2 retries
         assert rollout_outputs[0].get("error") is not None
-        assert (
-            "InfraError" in rollout_outputs[0]["error"]
-        )  # Error is serialized as repr string
+        error_info = rollout_outputs[0]["error"]
+        assert "InfraError" == error_info["error"]


 class TestEmptyModelResponseErrors:
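
These two hunks replace a substring check on a repr string with a lookup into a structured error payload. Only the "error" key, holding the exception class name, is attested by the tests; any other field in the sketch below is a guess:

# Hypothetical error payload: only the "error" key is attested by the tests;
# "message" is an assumed companion field, not confirmed by this diff.
error_info = {"error": "ToolError", "message": "tool call failed"}

# New-style check from the updated tests: exact match on the class name.
assert error_info["error"] == "ToolError"

# Old-style check the PR removed: a substring match against a repr string,
# which would also match unrelated text such as "MyToolErrorWrapper".
assert "ToolError" in repr(error_info)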
2 changes: 0 additions & 2 deletions tests/test_eval_cli.py
@@ -34,8 +34,6 @@ def _run_cli(monkeypatch, overrides, capture_all_configs: bool = False):
         "num_examples": 1,
         "rollouts_per_example": 1,
         "max_concurrent": 1,
-        "max_concurrent_generation": None,
-        "max_concurrent_scoring": None,
         "independent_scoring": False,
         "max_tokens": 42,
         "temperature": 0.9,
10 changes: 3 additions & 7 deletions tests/test_math_rubric.py
@@ -5,7 +5,6 @@
 import pytest

 import verifiers as vf
-from verifiers.utils.async_utils import NullAsyncContext


 class TestMathRubric:
@@ -55,9 +54,8 @@ async def test_score_valid_answers(self, test_case, make_input):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["metrics"]["correct_answer"] == 1.0

@@ -90,9 +88,8 @@ async def test_score_invalid_answers(self, test_case, make_input):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["metrics"]["correct_answer"] == 0.0

@@ -122,10 +119,9 @@ async def test_timeout(self, timeout_seconds, make_input):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

         start_time = time.time()
-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)
         end_time = time.time()
         elapsed_time = end_time - start_time
         assert state["metrics"]["correct_answer"] == 0.0
28 changes: 9 additions & 19 deletions tests/test_rubric.py
@@ -6,7 +6,6 @@

 from verifiers import Parser, Rubric
 from verifiers.types import RewardFunc, RolloutInput, State
-from verifiers.utils.async_utils import NullAsyncContext


 class TestRubric:
@@ -171,9 +170,8 @@ def func2(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert "func1" in state["metrics"]
         assert "func2" in state["metrics"]
@@ -211,9 +209,8 @@ def list_func(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["metrics"]["list_func"] == 2.0  # Length of completion list
         assert state["reward"] == 2.0
@@ -266,8 +263,7 @@ def length_func(completion, **kwargs):
             "start_time": 0.0,
         }

-        score_sem = NullAsyncContext()
-        await rubric.score_group(states, score_sem)
+        await rubric.score_group(states)

         assert states[0]["metrics"]["accuracy_func"] == 1.0
         assert states[1]["metrics"]["accuracy_func"] == 1.0
@@ -304,9 +300,8 @@ def func2(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_group([state], score_sem)
+        await rubric.score_group([state])

         # Weighted sum: 1.0*2.0 + 0.5*3.0 = 3.5
         assert state["reward"] == pytest.approx(3.5)
@@ -319,10 +314,9 @@ def test_func(completion, **kwargs):
             return 1.0

         rubric = Rubric(funcs=[test_func], weights=[1.0])
-        score_sem = NullAsyncContext()

         # score_group with empty list should handle gracefully
-        await rubric.score_group([], score_sem)
+        await rubric.score_group([])

     @pytest.mark.asyncio
     async def test_score_rollouts_with_default_infos(self):
@@ -349,9 +343,8 @@ def simple_func(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_group([state], score_sem)
+        await rubric.score_group([state])

         assert "simple_func" in state["metrics"]
         assert state["metrics"]["simple_func"] == 1.0
@@ -388,9 +381,8 @@ def scalar_func(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_group([state], score_sem)
+        await rubric.score_group([state])

         assert state["metrics"]["scalar_func"] == 0.5
         assert state["reward"] == 0.5
@@ -425,9 +417,8 @@ def f_with_kwargs(completion, **kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         # Weighted sum: 0.5*1 + 1.0*2 = 2.5
         assert state["reward"] == pytest.approx(2.5)
@@ -464,9 +455,8 @@ def g2(**kwargs):
             "total_ms": 0.0,
             "start_time": 0.0,
         }
-        score_sem = NullAsyncContext()

-        await rubric.score_rollout(state, score_sem)
+        await rubric.score_rollout(state)

         assert state["reward"] == pytest.approx(0.5)
         assert calls == ["g1", "g2"]  # order respected
22 changes: 7 additions & 15 deletions tests/test_rubric_group.py
@@ -4,7 +4,6 @@

 from verifiers import Rubric, RubricGroup, XMLParser
 from verifiers.types import RolloutInput, RolloutTiming, State
-from verifiers.utils.async_utils import NullAsyncContext


 class TestRubricGroup:
@@ -163,8 +162,7 @@ def func2(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should have scores from both rubrics
         assert "func1" in state["metrics"]
@@ -203,8 +201,7 @@ def func1(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should have summed scores for duplicate function names
         assert "func1" in state["metrics"]
@@ -244,8 +241,7 @@ def func1(completion, info=None, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should pass custom kwargs to reward functions
         assert "func1" in state["metrics"]
@@ -282,8 +278,7 @@ def func1(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group([state], score_sem)
+        await group.score_group([state])

         # Should work with single rubric
         assert "func1" in state["metrics"]
@@ -302,10 +297,9 @@ def func1(completion, **kwargs):

         # Test with empty data - should handle gracefully
         states = []
-        score_sem = NullAsyncContext()
         # Empty states should not cause errors
         try:
-            await group.score_group(states, score_sem)
+            await group.score_group(states)
         except ZeroDivisionError:
             pytest.skip("score_group doesn't handle empty states yet")

@@ -368,8 +362,7 @@ def func1(completion, **kwargs):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_group(states, score_sem)
+        await group.score_group(states)

         # Should work with multiple states
         assert "func1" in states[0]["metrics"]
@@ -420,8 +413,7 @@ def reward_func(completion, parser, answer, **_):
             start_time=0.0,
         )

-        score_sem = NullAsyncContext()
-        await group.score_rollout(state, score_sem)
+        await group.score_rollout(state)

         assert state["reward"] == 1.0
         assert recorded_parsers == [xml_parser]
8 changes: 3 additions & 5 deletions verifiers/envs/env_group.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import time
-from typing import TYPE_CHECKING, AsyncContextManager, Mapping, final
+from typing import TYPE_CHECKING, Mapping, final

 from openai import AsyncOpenAI

@@ -38,7 +38,6 @@ def _get_reward_func_names(self) -> list[str]:
     async def score_rollout(
         self,
         state: vf.State,
-        score_sem: AsyncContextManager,
     ) -> None:
         """
         Evaluate all reward functions in-place for a single rollout.
@@ -57,7 +56,7 @@ async def score_rollout(
             state["metrics"] = metrics
             return

-        await env.rubric.score_rollout(state, score_sem=score_sem)
+        await env.rubric.score_rollout(state)
         env_reward = state.get("reward", 0.0)
         env_metrics = state.get("metrics", {}).copy() if state.get("metrics") else {}

@@ -72,7 +71,6 @@ async def score_rollout(
     async def score_group(
         self,
         states: list[vf.State],
-        score_sem: AsyncContextManager,
     ) -> None:
         """
         Score a group of rollouts, routing to appropriate environment rubrics based on task.
@@ -95,7 +93,7 @@
             return

         # Score all states using the environment's rubric
-        await env.rubric.score_group(states, score_sem=score_sem)
+        await env.rubric.score_group(states)

         # Initialize metrics dict with all reward function names
         aggregated_metrics: dict[str, list[float]] = {
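
Together with the commit "deprecate gen/score sem and move global sem into generate()", the removal of score_sem here suggests concurrency is now bounded once, at the generate() call site, rather than threaded through every rubric. A hedged sketch of that pattern (run_one and the semaphore placement are illustrative, not the library's actual internals):

import asyncio

async def generate(env, states, max_concurrent: int = 8) -> None:
    # One global semaphore bounds generation and scoring together.
    sem = asyncio.Semaphore(max_concurrent)

    async def run_one(state) -> None:
        async with sem:
            # ... rollout generation would happen here ...
            await env.rubric.score_rollout(state)  # new signature: no score_sem

    await asyncio.gather(*(run_one(s) for s in states))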