Description
When I run 2048.ipynb, I get the following error:
```
TimeoutError Traceback (most recent call last)
Cell In[4], line 50
42 backend = LocalBackend(
43 # Normally we don't want to run the server in-process, but for the output
44 # to show up properly on Google Colab we'll enable this.
45 in_process=True,
46 path="./.art",
47 )
49 # Register the model with the local Backend (sets up logging, inference, and training)
---> 50 await model.register(backend)
File /opt/miniconda/envs/art/lib/python3.11/site-packages/art/model.py:307, in TrainableModel.register(self, backend, _openai_client_config)
301 async def register(
302 self,
303 backend: "Backend",
304 _openai_client_config: dev.OpenAIServerConfig | None = None,
305 ) -> None:
306 await super().register(backend)
--> 307 base_url, api_key = await backend._prepare_backend_for_training(
308 self, _openai_client_config
309 )
311 # Populate the top-level inference fields so that the rest of the
312 # code (and any user code) can create an OpenAI client immediately.
313 self.inference_base_url = base_url
File /opt/miniconda/envs/art/lib/python3.11/site-packages/art/local/backend.py:255, in LocalBackend._prepare_backend_for_training(self, model, config)
249 async def _prepare_backend_for_training(
250 self,
251 model: TrainableModel,
252 config: dev.OpenAIServerConfig | None = None,
253 ) -> tuple[str, str]:
254 service = await self._get_service(model)
--> 255 await service.start_openai_server(config=config)
256 server_args = (config or {}).get("server_args", {})
258 base_url = f"http://{server_args.get('host', '0.0.0.0')}:{server_args.get('port', 8000)}/v1"
File /opt/miniconda/envs/art/lib/python3.11/site-packages/art/torchtune/service.py:32, in TorchtuneService.start_openai_server(self, config)
31 async def start_openai_server(self, config: dev.OpenAIServerConfig | None) -> None:
---> 32 await openai_server_task(
33 engine=await self.llm,
34 config=dev.get_openai_server_config(
35 model_name=self.model_name,
36 base_model=self.get_last_checkpoint_dir() or self.base_model,
37 log_file=f"{self.output_dir}/logs/vllm.log",
38 config=config,
39 ),
40 )
File /opt/miniconda/envs/art/lib/python3.11/site-packages/art/vllm/server.py:81, in openai_server_task(engine, config)
75 done, _ = await asyncio.wait(
76 [openai_server_task, test_client_task],
77 timeout=timeout,
78 return_when="FIRST_COMPLETED",
79 )
80 if not done:
---> 81 raise TimeoutError(
82 f"Unable to reach OpenAI-compatible server within {timeout} seconds. You can increase this timeout by setting the ART_SERVER_TIMEOUT environment variable."
83 )
84 for task in done:
85 task.result()
TimeoutError: Unable to reach OpenAI-compatible server within 1000.0 seconds. You can increase this timeout by setting the ART_SERVER_TIMEOUT environment variable.
```
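As the error message itself suggests, the 1000-second default can be raised through the ART_SERVER_TIMEOUT environment variable. A minimal sketch of that workaround, assuming the variable is read when the vLLM server is launched; the 3000-second value is an arbitrary example, not a documented recommendation:

```python
import os

# Hypothetical larger timeout in seconds; must be set before
# model.register(backend) triggers the OpenAI-compatible server startup.
os.environ["ART_SERVER_TIMEOUT"] = "3000"

# Then run the notebook cell as before:
# backend = LocalBackend(in_process=True, path="./.art")
# await model.register(backend)
```

Note that this only extends the wait: if the server never comes up at all (for example, due to insufficient GPU memory), registration will still fail eventually, so the vllm.log under the backend's output directory (the log_file path visible in the traceback) may reveal the underlying startup error.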