Adding another way of calculating gradients in downsamplers #540

Merged · 18 commits · Jun 25, 2024
add more tests
XianzheMa committed Jun 25, 2024
commit 21324b29b4ed9620623aa66281562e73d5bef068
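
This commit parametrizes the matrix-downsampler tests over the new MatrixContent.LAST_LAYER_GRADIENTS option (alongside the existing LAST_TWO_LAYERS_GRADIENTS) and moves the closed-form gradient test into the AbstractRemoteDownsamplingStrategy test module. For context, the closed-form identity the tests rely on: for cross-entropy on logits z with integer targets y, the gradient of the summed loss with respect to z is softmax(z) - one_hot(y). A minimal, self-contained sketch of that identity (illustrative names, not Modyn's code):

```python
# Minimal sketch (not the PR's implementation): for cross-entropy over logits z
# with integer targets y, the gradient of the summed loss w.r.t. z is
# softmax(z) - one_hot(y), so no call to torch.autograd.grad is needed.
import torch
import torch.nn.functional as F


def closed_form_last_layer_grad(forward_output: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    probs = F.softmax(forward_output, dim=-1)  # (batch, num_classes)
    one_hot = F.one_hot(target, num_classes=forward_output.shape[-1]).float()
    return probs - one_hot  # one gradient row per sample


logits = torch.randn(4, 2, requires_grad=True)
target = torch.randint(0, 2, (4,))
expected = torch.autograd.grad(F.cross_entropy(logits, target, reduction="sum"), logits)[0]
assert torch.allclose(closed_form_last_layer_grad(logits, target), expected, atol=1e-6)
```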
File 1: tests for AbstractMatrixDownsamplingStrategy
@@ -1,6 +1,7 @@
 # pylint: disable=abstract-class-instantiated,unused-argument
 from unittest.mock import patch
 
+import pytest
 import numpy as np
 import torch
 from modyn.config import ModynConfig
@@ -10,7 +11,9 @@
 )
 
 
-def get_sampler_config(dummy_system_config: ModynConfig, balance=False):
+def get_sampler_config(
+    dummy_system_config: ModynConfig, balance=False, matrix_content=MatrixContent.LAST_TWO_LAYERS_GRADIENTS
+):
     downsampling_ratio = 50
     per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
 
@@ -29,7 +32,7 @@ def get_sampler_config(dummy_system_config: ModynConfig, balance=False):
         dummy_system_config.model_dump(by_alias=True),
         per_sample_loss_fct,
         "cpu",
-        MatrixContent.LAST_TWO_LAYERS_GRADIENTS,
+        matrix_content,
     )
 
 
@@ -106,9 +109,12 @@ def test_collect_embedding_balance(test_amds, dummy_system_config: ModynConfig):
     assert amds.already_selected_samples == [1, 3, 1000, 1002]
 
 
+@pytest.mark.parametrize(
+    "matrix_content", [MatrixContent.LAST_LAYER_GRADIENTS, MatrixContent.LAST_TWO_LAYERS_GRADIENTS]
+)
 @patch.multiple(AbstractMatrixDownsamplingStrategy, __abstractmethods__=set())
-def test_collect_gradients(dummy_system_config: ModynConfig):
-    amds = AbstractMatrixDownsamplingStrategy(*get_sampler_config(dummy_system_config))
+def test_collect_gradients(matrix_content, dummy_system_config: ModynConfig):
+    amds = AbstractMatrixDownsamplingStrategy(*get_sampler_config(dummy_system_config, matrix_content=matrix_content))
     with torch.inference_mode(mode=(not amds.requires_grad)):
         forward_input = torch.randn((4, 5))
         first_output = torch.randn((4, 2))
@@ -125,29 +131,14 @@ def test_collect_gradients(dummy_system_config: ModynConfig):
 
     assert len(amds.matrix_elements) == 2
 
-    # expected shape = (a,b)
+    # expected shape = (a, gradient_shape)
     # a = 7 (4 samples in the first batch and 3 samples in the second batch)
-    # b = 5 * 2 + 2 where 5 is the input dimension of the last layer and 2 is the output one
-    assert np.concatenate(amds.matrix_elements).shape == (7, 12)
+    if matrix_content == MatrixContent.LAST_LAYER_GRADIENTS:
+        # shape same as the last dimension of output
+        gradient_shape = 2
+    else:
+        # 5 is the input dimension of the last layer and 2 is the output one
+        gradient_shape = 5 * 2 + 2
+    assert np.concatenate(amds.matrix_elements).shape == (7, gradient_shape)
 
     assert amds.index_sampleid_map == [1, 2, 3, 4, 21, 31, 41]
-
-
-@patch(
-    "modyn.trainer_server.internal.trainer.remote_downsamplers"
-    ".abstract_matrix_downsampling_strategy.torch.autograd.grad",
-    wraps=torch.autograd.grad,
-)
-def test__compute_last_layer_gradient_wrt_loss_sum(mock_torch_auto_grad):
-    per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
-    forward_output = torch.randn((4, 2), requires_grad=True)
-    # random target
-    target = torch.randint(0, 2, (4,))
-    last_layer_gradients = AbstractMatrixDownsamplingStrategy._compute_last_layer_gradient_wrt_loss_sum(
-        per_sample_loss_fct, forward_output, target
-    )
-    # as we use CrossEntropyLoss, the gradient is computed in a closed form
-    assert mock_torch_auto_grad.call_count == 0
-    # verify that the gradients calculated via the closed form are equal to the ones calculated by autograd
-    expected_grad = torch.autograd.grad(per_sample_loss_fct(forward_output, target).sum(), forward_output)[0]
-    assert torch.allclose(last_layer_gradients, expected_grad)
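
A note on the shape arithmetic asserted in the parametrized test: with last-layer gradients, each per-sample row has one entry per class (2), while with last-two-layers gradients it flattens the final layer's weight and bias gradients into 5 * 2 + 2 = 12 entries, as the test's comments state. A hypothetical sketch of that arithmetic, assuming the final layer is a Linear(5, 2) (dimension names are illustrative, not Modyn's):

```python
# Hypothetical sketch of the shape arithmetic in the parametrized test above
# (names are illustrative, not Modyn's): for a final torch.nn.Linear(5, 2), the
# per-sample gradient w.r.t. the layer output has 2 entries, while the gradient
# w.r.t. the layer's weight and bias flattens to 5 * 2 + 2 = 12 entries.
import torch

batch, in_features, num_classes = 4, 5, 2
last_layer_input = torch.randn(batch, in_features)
grad_wrt_output = torch.randn(batch, num_classes)  # e.g. softmax(z) - one_hot(y)

# per-sample weight gradient = outer product of output grad and layer input;
# per-sample bias gradient = the output grad itself
weight_grads = torch.einsum("bo,bi->boi", grad_wrt_output, last_layer_input)
per_sample = torch.cat([weight_grads.flatten(start_dim=1), grad_wrt_output], dim=1)
assert grad_wrt_output.shape == (batch, 2)     # LAST_LAYER_GRADIENTS width
assert per_sample.shape == (batch, 5 * 2 + 2)  # LAST_TWO_LAYERS_GRADIENTS width
```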
File 2: tests for AbstractRemoteDownsamplingStrategy
@@ -1,6 +1,7 @@
 # pylint: disable=abstract-class-instantiated
 from unittest.mock import patch
 
+import torch
 from modyn.config import ModynConfig
 from modyn.trainer_server.internal.trainer.remote_downsamplers.abstract_remote_downsampling_strategy import (
     AbstractRemoteDownsamplingStrategy,
@@ -21,3 +22,23 @@ def test_batch_then_sample_general(dummy_system_config: ModynConfig):
     assert sampler.trigger_id == 128
     assert sampler.pipeline_id == 154
     assert sampler.batch_size == 64
+
+
+@patch(
+    "modyn.trainer_server.internal.trainer.remote_downsamplers"
+    ".abstract_remote_downsampling_strategy.torch.autograd.grad",
+    wraps=torch.autograd.grad,
+)
+def test__compute_last_layer_gradient_wrt_loss_sum(mock_torch_auto_grad):
+    per_sample_loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
+    forward_output = torch.randn((4, 2), requires_grad=True)
+    # random target
+    target = torch.randint(0, 2, (4,))
+    last_layer_gradients = AbstractRemoteDownsamplingStrategy._compute_last_layer_gradient_wrt_loss_sum(
+        per_sample_loss_fct, forward_output, target
+    )
+    # as we use CrossEntropyLoss, the gradient is computed in a closed form
+    assert mock_torch_auto_grad.call_count == 0
+    # verify that the gradients calculated via the closed form are equal to the ones calculated by autograd
+    expected_grad = torch.autograd.grad(per_sample_loss_fct(forward_output, target).sum(), forward_output)[0]
+    assert torch.allclose(last_layer_gradients, expected_grad)
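
The mock above wraps the real torch.autograd.grad, so the test can count calls without changing behavior: call_count == 0 proves the closed form was used for CrossEntropyLoss, and the allclose check confirms it matches autograd. A hedged sketch of the dispatch this test implies (an assumption about the helper's structure, not Modyn's actual implementation):

```python
# Hedged sketch (an assumption, not Modyn's exact code) of the dispatch the test
# above implies: CrossEntropyLoss takes the softmax-minus-one-hot closed form,
# any other per-sample loss falls back to torch.autograd.grad.
import torch


def compute_last_layer_gradient_wrt_loss_sum(per_sample_loss_fct, forward_output, target):
    if isinstance(per_sample_loss_fct, torch.nn.CrossEntropyLoss):
        probs = torch.nn.functional.softmax(forward_output, dim=-1)
        one_hot = torch.nn.functional.one_hot(
            target, num_classes=forward_output.shape[-1]
        ).float()
        return probs - one_hot  # closed form: torch.autograd.grad is never called
    loss = per_sample_loss_fct(forward_output, target).sum()
    return torch.autograd.grad(loss, forward_output)[0]
```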