Skip to content

Commit

Permalink
Small stability and logging fixes (#522)
Browse files Browse the repository at this point in the history
This concludes the important changes I have on the SIGMOD branch. These
changes are too small to open more PRs for them, so I grouped them in a
single PR.
  • Loading branch information
MaxiBoether authored Jun 18, 2024
1 parent ebac30d commit 75868bb
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 4 deletions.
2 changes: 1 addition & 1 deletion modyn/evaluator/internal/grpc/evaluator_grpc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self, modyn_config: dict, tempdir: pathlib.Path) -> None:
self.tempdir = tempdir
self.server = grpc.server(
futures.ThreadPoolExecutor(
max_workers=10,
max_workers=64,
),
options=[
("grpc.max_receive_message_length", MAX_MESSAGE_SIZE),
Expand Down
2 changes: 1 addition & 1 deletion modyn/evaluator/internal/pytorch_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def _prepare_dataloader(self, evaluation_info: EvaluationInfo) -> torch.utils.da
)
self._debug("Creating DataLoader.")
dataloader = torch.utils.data.DataLoader(
dataset, batch_size=evaluation_info.batch_size, num_workers=evaluation_info.num_dataloaders
dataset, batch_size=evaluation_info.batch_size, num_workers=evaluation_info.num_dataloaders, timeout=60
)

return dataloader
Expand Down
3 changes: 2 additions & 1 deletion modyn/supervisor/internal/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,8 @@ def start_pipeline(
start_timestamp = self.grpc.get_time_at_storage()
pipeline_id = self.register_pipeline(pipeline_config_model)
logger.info(f"Pipeline {pipeline_id} registered, start executing.")
except Exception: # pylint: disable=broad-except
except Exception as ex: # pylint: disable=broad-except
logger.error(f"Failed to register pipeline: {ex}")
return pipeline_res_msg(exception="Failed to register pipeline")

try:
Expand Down
2 changes: 1 addition & 1 deletion modyn/trainer_server/internal/dataset/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def prepare_dataloaders(
)
logger.debug("Creating DataLoader.")
train_dataloader = torch.utils.data.DataLoader(
train_set, batch_size=batch_size, num_workers=num_dataloaders, drop_last=drop_last
train_set, batch_size=batch_size, num_workers=num_dataloaders, drop_last=drop_last, timeout=60
)

# TODO(#50): what to do with the val set in the general case?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,12 @@ def get_num_data_partitions(self) -> int:

def end_of_trigger_cleaning(self) -> None:
    """Ask the sample storage to clean up the data for this pipeline/trigger pair."""
    storage = self._trigger_sample_storage
    storage.clean_trigger_data(self._pipeline_id, self._trigger_id)

def __getstate__(self):
state = self.__dict__.copy()
del state["_trigger_sample_storage"] # not pickable
return state

def __setstate__(self, state):
    """Restore pickled state and recreate the storage handle.

    ``__getstate__`` strips ``_trigger_sample_storage`` before pickling, so
    a fresh ``TriggerSampleStorage`` is built here from the restored
    ``offline_dataset_path``.
    """
    for key, value in state.items():
        self.__dict__[key] = value
    self._trigger_sample_storage = TriggerSampleStorage(self.offline_dataset_path)

0 comments on commit 75868bb

Please sign in to comment.