Skip to content

Commit

Permalink
chore: remove e2e_slurm_gpu series tests (#10021)
Browse files Browse the repository at this point in the history
Note that there are nightly tests decorated with:

   - @e2e_slurm
   - skipif(not torch.cuda.is_available())

So we still have some GPU-specific slurm tests at this point.  But those
tests were not actually running as part of the e2e_slurm_gpu tests
anyway.

This is part of a larger effort to get rid of our znode tests, which
are notoriously unreliable.
  • Loading branch information
rb-determined-ai authored Oct 10, 2024
1 parent 1f7ccad commit 2594d90
Show file tree
Hide file tree
Showing 13 changed files with 82 additions and 200 deletions.
22 changes: 0 additions & 22 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4208,18 +4208,6 @@ workflows:
security:
initial_user_password: ${INITIAL_USER_PASSWORD}
- test-e2e-slurm:
name: test-e2e-slurm-gpu
mark: "e2e_slurm_gpu"
requires:
- package-and-push-system-local-ee
context:
- dev-ci-cluster-default-user-credentials
filters:
branches:
only:
- main

# Singularity over SLURM test on GCP
- test-e2e-hpc-gcp:
context:
Expand Down Expand Up @@ -5106,16 +5094,6 @@ workflows:
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false

- test-e2e-slurm:
name: test-e2e-slurm-gpu
context:
- dev-ci-cluster-default-user-credentials
filters: *upstream-feature-branch
mark: "e2e_slurm_gpu"
requires:
- package-and-push-system-local-ee
- request-hpc-tests

- test-e2e-slurm:
name: test-e2e-slurm-enroot-znode
context:
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ markers =
e2e_pbs: end to end pbs integration tests
e2e_saml: tests for saml with okta
e2e_slurm: end to end slurm integration tests
e2e_slurm_gpu: end to end slurm GPU tests
e2e_slurm_restart: slurm integration tests that require restarting the master
e2e_slurm_preemption: hpc integration test to ensure preemption is working
e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
Expand Down
2 changes: 1 addition & 1 deletion e2e_tests/tests/cluster/test_checkpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:


@pytest.mark.e2e_gpu
@pytest.mark.e2e_slurm_gpu
def test_set_gc_policy() -> None:
sess = api_utils.user_session()
save_exp_best = 3
Expand Down Expand Up @@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:


@pytest.mark.e2e_cpu
@pytest.mark.e2e_slurm
def test_delete_checkpoints() -> None:
sess = api_utils.user_session()
config = {
Expand Down
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@


@pytest.mark.e2e_gpu
@pytest.mark.e2e_slurm_gpu
@pytest.mark.timeout(30 * 60)
def test_streaming_observability_metrics_apis() -> None:
sess = api_utils.user_session()
Expand Down
64 changes: 0 additions & 64 deletions e2e_tests/tests/experiment/test_pytorch.py

This file was deleted.

Empty file.
24 changes: 0 additions & 24 deletions e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml

This file was deleted.

79 changes: 0 additions & 79 deletions e2e_tests/tests/fixtures/pytorch_identity/model_def.py

This file was deleted.

3 changes: 1 addition & 2 deletions e2e_tests/tests/nightly/test_pytorch2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@

@pytest.mark.distributed
@pytest.mark.gpu_required
@pytest.mark.e2e_slurm_gpu
def test_pytorch2_hf_language_modeling_distributed() -> None:
sess = api_utils.user_session()
test_dir = "hf_language_modeling"

config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
config = conf.set_pt2_image(config)
config = conf.set_slots_per_trial(config, 4)
config = conf.set_slots_per_trial(config, 8)

# Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
config = conf.set_entrypoint(
Expand Down

This file was deleted.

This file was deleted.

79 changes: 79 additions & 0 deletions harness/tests/experiment/fixtures/pytorch_identity/model_def.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from typing import Any, Dict, Tuple

import torch.utils.data

from determined import pytorch


class MetricsCallback(pytorch.PyTorchCallback):
def __init__(self):
self.validation_metrics = []

def on_validation_end(self, metrics: Dict[str, Any]) -> None:
self.validation_metrics.append(metrics)


class IdentityDataset(torch.utils.data.Dataset):
def __init__(self, initial_value: int = 1):
self.initial_value = initial_value

def __len__(self) -> int:
return 64

def __getitem__(self, index: int) -> Tuple:
v = float(self.initial_value + 0.1 * index)
return torch.Tensor([v]), torch.Tensor([v])


class IdentityPyTorchTrial(pytorch.PyTorchTrial):
def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
self.context = context

model = torch.nn.Linear(1, 1, False)
model.weight.data.fill_(0)
self.model = context.wrap_model(model)

self.lr = 0.001

optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
self.opt = context.wrap_optimizer(optimizer)

self.loss_fn = torch.nn.MSELoss(reduction="mean")
self.metrics_callback = MetricsCallback()

def train_batch(
self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
) -> Dict[str, torch.Tensor]:
data, label = batch

loss = self.loss_fn(self.model(data), label)

self.context.backward(loss)

self.context.step_optimizer(self.opt)

return {
"loss": loss,
}

def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
data, label = batch

loss = self.loss_fn(self.model(data), label)

weight = self.model.weight.data.item()

return {"val_loss": loss, "weight": weight}

def build_training_data_loader(self) -> pytorch.DataLoader:
return pytorch.DataLoader(
IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
)

def build_validation_data_loader(self) -> pytorch.DataLoader:
return pytorch.DataLoader(
IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
)

def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
return {"metrics": self.metrics_callback}
4 changes: 1 addition & 3 deletions harness/tests/experiment/pytorch/test_pytorch_trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -1468,16 +1468,14 @@ def amp_metrics_test(trial_class, training_metrics, agg_freq=1):
def run_identity(tmp_path: pathlib.Path):
checkpoint_dir = str(tmp_path.joinpath("checkpoint"))

config = utils.load_config(utils.fixtures_path("pytorch_identity/distributed.yaml"))
hparams = config["hyperparameters"]
hparams = {"global_batch_size": 4}

exp_config = utils.make_default_exp_config(
hparams,
scheduling_unit=1,
searcher_metric="validation_loss",
checkpoint_dir=checkpoint_dir,
)
exp_config.update(config)
exp_config["searcher"]["smaller_is_better"] = True

# each subprocess must import separately as trial_class cannot be pickled.
Expand Down
1 change: 0 additions & 1 deletion tools/slurm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
**On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**
The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
- `test-e2e-slurm-gpu`: Test is skipped because the compute instance that the tests run on do not have any GPUs.
- `test-e2e-slurm-misconfigured`: This test could be made to work, but requires passing in a misconfigured `master.yaml` to the launcher on GCP, which could be tedious.
- `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
- `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
Expand Down

0 comments on commit 2594d90

Please sign in to comment.