Model parameters don't get updated after upgrading from 1.1.4 to 2.0.7 #18346
@amifallingstars Thanks for reaching out. Unfortunately it is too hard for us to know what's going on given this extremely brief description. If you have determined that the model weights don't get updated, I assume you mean the optimizer step doesn't get called? This could be the case if you return None from your training_step for some reason. Please note that for us to be able to work on this, we'd need some evidence that there is a bug in Lightning. The best would be if you could provide a code example that reproduces your issue.
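(For context, a minimal sketch of the behavior mentioned above; the module and loss below are hypothetical and not part of the original thread. In automatic optimization, Lightning skips the backward pass and the optimizer step for any batch where training_step returns None, so the loss tensor must be returned.)

```python
import torch
import lightning as L


class MinimalModule(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.layer(x), y)
        # Returning None here (e.g. by forgetting the return statement)
        # would make Lightning skip backward() and the optimizer step
        # for this batch in automatic optimization.
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters())
```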
@awaelchli Hi, thanks for answering my question. Here is my LightningModule:

```python
class Fegnet(L.LightningModule):
    def __init__(self, _config, word_vectors):
        super().__init__()
        self.save_hyperparameters("_config")
        self._config = _config
        self.model = MyNet(_config, word_vectors)
        self.table = []
        utils.set_metrics(self)

    def forward(self, inputs):
        return self.model(inputs)

    def training_step(self, batch, batch_idx):
        inputs, records = self.process_batch(batch)
        output = self(inputs)
        self.logger.experiment.add_scalar("train/loss", output["loss"], self.global_step) if self.logger else None
        self.compute_iou_and_update(output, records)
        return output["loss"]

    def on_training_epoch_end(self):
        self.log_metrics()

    def validation_step(self, batch, batch_idx):
        inputs, records = self.process_batch(batch)
        output = self(inputs)
        self.logger.experiment.add_scalar("val/loss", output["loss"], self.global_step) if self.logger else None
        self.compute_iou_and_update(output, records)
        return output

    def on_validation_epoch_end(self):
        self.log_metrics()

    def on_fit_end(self):
        self.log_table()
        return super().on_fit_end()

    def process_batch(self, batch):
        records, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels = batch
        inputs = edict()
        inputs.vfeats = vfeats
        inputs.h_labels = h_labels
        inputs.s_labels, inputs.e_labels = s_labels, e_labels
        inputs.word_ids, inputs.char_ids = word_ids, char_ids
        # generate mask inputs
        inputs.query_mask = (torch.zeros_like(inputs.word_ids) != inputs.word_ids).float()
        inputs.video_mask = convert_length_to_mask(vfeat_lens)
        return inputs, records

    def compute_iou_and_update(self, output, records, track=None):
        start_logits_list = output["start_logits_list"]
        end_logits_list = output["end_logits_list"]
        start_indices, end_indices = extract_index(start_logits_list[0], end_logits_list[0])
        for record, start_index, end_index in zip(records, start_indices, end_indices):
            start_time, end_time = index_to_time(start_index, end_index, record["v_len"], record["duration"])
            iou = calculate_iou(i0=[start_time, end_time], i1=[record["s_time"], record["e_time"]])
            utils.update_module_ious(self, iou, operation="update")

    def log_metrics(self):
        phase = "train" if self.training else "test"
        r1i3, r1i5, r1i7, mIoU = utils.get_module_ious(self)
        self.table.append([self.global_step, r1i7 * 100, r1i5 * 100, r1i3 * 100, mIoU * 100])
        self.log(f"{phase}/r1i7", r1i7)
        self.logger.experiment.add_scalars(f"{phase}", {
            "r1i7": r1i7 * 100,
            "r1i5": r1i5 * 100,
            "r1i3": r1i3 * 100,
            "mIoU": mIoU * 100
        }, self.global_step) if self.logger else None
        if not self.training:
            print(f"\t\t\t\tgstep: {self.global_step}, r1i7: {r1i7*100:02.02f}, r1i5: {r1i5*100:02.02f}, r1i3: {r1i3*100:02.02f}, mIoU: {mIoU*100:02.02f}")
        utils.update_module_ious(self, operation="reset")

    def configure_optimizers(self):
        return utils.set_schedule(self)

    def log_table(self):
        columns = ["gstep", "r1i7", "r1i5", "r1i3", "mIoU"]
        data = sorted(self.table, key=lambda x: x[1], reverse=True)
        # data = list(map(lambda x: [x[0], x[1]*100, x[2]*100, x[3]*100, x[4]*100], data))
        self.logger.log_table(key="performance", columns=columns, data=data)

    def log_text(self):
        header = ["gstep", "r1i7", "r1i5", "r1i3", "mIoU"]
        data = sorted(self.table, key=lambda x: x[1], reverse=True)
        text = list_to_markdown_table(data, header)
        self.logger.add_text('Final Performance', text, global_step=0)
```

and my Trainer is:

```python
@ex.automain
def main(_config):
    _config = copy.deepcopy(_config)
    L.seed_everything(12345)
    dm = eval(_config["data_module"])(_config)
    _config["model"].update({"word_size": dm.data.n_words, "char_size": dm.data.n_chars})
    model_params = edict()
    model_params.update(_config)
    model = Fegnet(model_params, dm.data.word_vector)
    log_dir = os.path.join(_config["paths"]["log_dir"], _config["dataset"], _config["model"]["name"])
    save_dir = os.path.join(log_dir, _config["exp_name"])
    if os.path.exists(save_dir): rmtree(save_dir)
    logger = TensorBoardLogger(save_dir=save_dir, version=_config["exp_name"]) if _config["use_logger"] else False
    # Define the checkpoints for id and ood training
    checkpoint_callback = ModelCheckpoint(
        dirpath=os.path.join(save_dir, "cpt_id"),
        # filename='checkpoint-ood-{global_step}-{test_ood/r1i7*100:.2f}',
        save_top_k=3,
        verbose=True,
        monitor="test/r1i7",
        mode="max",
    )
    print('=' * 71 + 'Config: ' + '=' * 71)
    pprint(_config)
    print('=' * 150)
    sys.stdout.flush()
    trainer = L.Trainer(
        devices=1,
        max_epochs=_config["train"]["epochs"],
        logger=logger,
        gradient_clip_val=1.0,
        val_check_interval=0.5,
        precision=32,
        callbacks=[checkpoint_callback]
    )
    trainer.fit(model, datamodule=dm)
```

All my output is the same with pytorch_lightning 2.0.7.
What is this part?

```python
def configure_optimizers(self):
    return utils.set_schedule(self)
```

Is it returning a valid torch Optimizer that handles the closure correctly?
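(For reference, a minimal sketch of the return formats Lightning accepts from configure_optimizers; the optimizer and scheduler choices here are placeholders, not the code from this issue.)

```python
import torch

def configure_optimizers(self):
    optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
    # Either return the optimizer alone...
    # return optimizer
    # ...or a dict pairing it with a scheduler. "interval" controls whether
    # the scheduler steps per epoch (the default) or per optimizer step.
    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
    }
```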
```python
def set_schedule(pl_module):
    lr = pl_module.hparams._config.train.lr
    wd = pl_module.hparams._config.train.decay
    no_decay = [
        "bias",
        "layer_norm",
        "LayerNorm",
    ]
    optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in pl_module.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": wd,
        },
        {
            "params": [
                p
                for n, p in pl_module.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    if pl_module.trainer.max_steps is None:
        max_steps = (
            len(pl_module.trainer.datamodule.train_dataloader())
            * pl_module.trainer.max_epochs
        )
    else:
        max_steps = pl_module.trainer.max_steps
    # optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8, betas=(0.9, 0.98))
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=max_steps * pl_module.hparams._config.train.warmup_proportion,
        num_training_steps=max_steps,
    )
    return {
        "optimizer": optimizer,
        "lr_scheduler": {
            "scheduler": scheduler,
        },
    }
```
@amifallingstars I can't spot anything suspicious in your code.
@awaelchli Sure, I have printed the parameters from epoch to epoch, and they didn't change at all. The loss is also the same. I really want to use the new features in version 2.x, but I can't because of this problem. I run the code on Linux with Nvidia RTX A5000, Titan X, and V100 GPUs, and none of them works. I also checked my CUDA version and Python version, which are both within the range given in the official documentation. I have tried the demo on the official site, which trains an encoder and a decoder on the MNIST dataset, and that loss goes down as expected with version 2.x. I don't know what I can do. T_T
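(As an aside, a minimal, hypothetical way to verify whether the parameters change during training is a small callback like the one below; the callback name and attribute are illustrative and not part of the original post.)

```python
import torch
from lightning.pytorch.callbacks import Callback


class ParameterChangeCheck(Callback):
    """Snapshot the parameters at the start of each epoch and report
    whether any of them changed by the end of the epoch."""

    def on_train_epoch_start(self, trainer, pl_module):
        self._snapshot = {n: p.detach().clone() for n, p in pl_module.named_parameters()}

    def on_train_epoch_end(self, trainer, pl_module):
        changed = any(
            not torch.equal(p.detach(), self._snapshot[n])
            for n, p in pl_module.named_parameters()
        )
        print(f"epoch {trainer.current_epoch}: parameters changed = {changed}")
```

It would be attached via `L.Trainer(callbacks=[ParameterChangeCheck(), ...])`.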
I see, and does the learning rate scheduler assign a positive learning rate? Note that if the learning rate is set to 0, the parameters wouldn't change. As a sanity check, please remove the scheduler and just return:

```python
def configure_optimizers(self):
    return torch.optim.AdamW(self.parameters())
```
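(One way to check the learning rate the scheduler actually assigns is Lightning's LearningRateMonitor callback, or printing the optimizer's param groups directly; the snippet below is only an illustrative sketch.)

```python
from lightning.pytorch.callbacks import LearningRateMonitor

# Logs the current learning rate of every optimizer/param group to the
# attached logger at each step, so a schedule stuck at 0 is easy to spot.
lr_monitor = LearningRateMonitor(logging_interval="step")
# trainer = L.Trainer(..., callbacks=[checkpoint_callback, lr_monitor])

# Alternatively, print the learning rate inside training_step:
# print(self.trainer.optimizers[0].param_groups[0]["lr"])
```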
Yes, I just
From what you have posted earlier, it is clear that the issue comes from the optimizer and how its parameters get managed — can you check how that part is implemented?
Please understand that if you don't share the code, I won't be able to help much. The forum is better suited for implementation help questions. Here we are discussing bugs in the Lightning framework, and to pursue them we need some evidence that there is something wrong in Lightning. A screenshot is not sufficient.
```python
import torch
import random
from transformers.optimization import AdamW
from transformers import (
    get_polynomial_decay_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)
from .metrics import IoUThresholdPercentage, Scalar, MeanIoU
from project.models import objectives


def set_metrics(pl_module):
    for split in ["train", "test"]:
        setattr(pl_module, f"{split}_r1i3", IoUThresholdPercentage(threshold=0.3))
        setattr(pl_module, f"{split}_r1i5", IoUThresholdPercentage(threshold=0.5))
        setattr(pl_module, f"{split}_r1i7", IoUThresholdPercentage(threshold=0.7))
        setattr(pl_module, f"{split}_mIoU", MeanIoU())


def update_module_ious(pl_module, iou=None, operation=None):
    phase = "train" if pl_module.training else "test"
    names = [f"{phase}_r1i3", f"{phase}_r1i5", f"{phase}_r1i7", f"{phase}_mIoU"]
    for name in names:
        if operation == "update":
            assert iou is not None, "update an invalid iou value"
            getattr(pl_module, name)(iou)
        elif operation == "reset":
            getattr(pl_module, name).reset()
    return None


def get_module_ious(pl_module):
    phase = "train" if pl_module.training else "test"
    names = [f"{phase}_r1i3", f"{phase}_r1i5", f"{phase}_r1i7", f"{phase}_mIoU"]
    return [getattr(pl_module, name).compute() for name in names]


def set_schedule(pl_module):
    lr = pl_module.hparams._config.train.lr
    # print(lr)
    # raise
    wd = pl_module.hparams._config.train.decay
    no_decay = [
        "bias",
        "layer_norm",
        "LayerNorm",
    ]
    optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in pl_module.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": wd,
        },
        {
            "params": [
                p
                for n, p in pl_module.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    # print(optimizer_grouped_parameters)
    # raise
    if pl_module.trainer.max_steps is None:
        max_steps = (
            len(pl_module.trainer.datamodule.train_dataloader())
            * pl_module.trainer.max_epochs
        )
    else:
        max_steps = pl_module.trainer.max_steps
    optimizer = torch.optim.AdamW(pl_module.parameters(), lr=lr, eps=1e-8, betas=(0.9, 0.98))
    # optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=max_steps * pl_module.hparams._config.train.warmup_proportion,
        num_training_steps=max_steps,
    )
    # return optimizer
    return {
        "optimizer": optimizer,
        "lr_scheduler": {
            "scheduler": scheduler,
        },
    }
```
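(Along the lines of the question above about whether the scheduler assigns a positive learning rate, one way to sanity-check the set_schedule logic in isolation is a small standalone script like the following. The model and config values are placeholders; note that `Trainer.max_steps` defaults to -1 rather than None in recent Lightning versions, which is worth verifying against the installed version.)

```python
import torch
from transformers import get_linear_schedule_with_warmup

# Placeholder values; substitute the actual value printed from
# pl_module.trainer.max_steps and the configured warmup_proportion.
max_steps = -1
warmup_proportion = 0.1

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=max_steps * warmup_proportion,
    num_training_steps=max_steps,
)

# Print the learning rate for the first few steps to see whether it is ever positive.
for step in range(5):
    print(step, scheduler.get_last_lr())
    optimizer.step()
    scheduler.step()
```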
The zeros are the initialization of the linear bias:

```python
def init_parameters(self):
    def init_weights(m):
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                torch.nn.init.zeros_(m.bias)
        elif isinstance(m, nn.LSTM):
            m.reset_parameters()
    self.apply(init_weights)
```
Bug description
I have code that can be trained in 1.4, but the model parameters are not updated in version 2.0.7. What could be the reason? Thanks a lot.
What version are you seeing the problem on?
v2.0, master
How to reproduce the bug
No response
Error messages and logs
Environment
More info
No response