diff --git a/configs/experimental/roberta/roberta.oscar.nofilter.wechsel.largedict.sophia.json b/configs/experimental/roberta/roberta.oscar.nofilter.wechsel.largedict.sophia.json
new file mode 100644
index 0000000..daa1446
--- /dev/null
+++ b/configs/experimental/roberta/roberta.oscar.nofilter.wechsel.largedict.sophia.json
@@ -0,0 +1,30 @@
+{
+    "output_dir": "exps/roberta.oscar.nofilter.wechsel.largedict.sophia",
+    "model_type": "roberta",
+    "config_name": "exps/roberta.oscar.nofilter.wechsel.largedict.sophia",
+    "tokenizer_name": "exps/roberta.oscar.nofilter.wechsel.largedict.sophia",
+    "train_file": "data/oscar.nofilter",
+    "max_seq_length": 512,
+    "do_train": true,
+    "do_eval": true,
+    "validation_file": "data/bruk_valid_data.txt",
+    "weight_decay": 1e-1,
+    "per_device_train_batch_size": 24,
+    "per_device_eval_batch_size": 24,
+    "gradient_accumulation_steps": 4,
+    "learning_rate": 2e-4,
+    "evaluation_strategy": "steps",
+    "warmup_steps": 25000,
+    "max_steps": 250000,
+    "eval_steps": 500,
+    "save_steps": 12500,
+    "logging_steps": 500,
+    "overwrite_output_dir": true,
+    "num_train_epochs": 0,
+    "adam_beta1": 0.965,
+    "adam_beta2": 0.99,
+    "adam_epsilon": 1e-6,
+    "preprocessing_num_workers": 48,
+    "fp16": true,
+    "model_name_or_path": "exps/roberta.oscar.nofilter.wechsel.largedict.sophia"
+}
\ No newline at end of file
diff --git a/run_mlm.py b/run_mlm.py
index b9af68a..e7b3040 100644
--- a/run_mlm.py
+++ b/run_mlm.py
@@ -180,6 +180,11 @@ class DataTrainingArguments:
         metadata={"help": "The name of the project to which the training run will belong on Weights & Biases."}
     )
 
+    optimizer: Optional[str] = field(
+        default="adam",
+        metadata={"help": "The optimizer to use for training."}
+    )
+
 
 class DataCollatorForLanguageModeling(transformers.DataCollatorForLanguageModeling):
     def torch_mask_tokens(self, inputs, special_tokens_mask = None):
@@ -503,17 +508,40 @@ def compute_metrics(eval_preds):
         wandb.config.update(data_args)
         wandb.save(__file__, policy="now")
 
-    # Initialize our Trainer
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        compute_metrics=compute_metrics if training_args.do_eval else None,
-        preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None,
-    )
+    if data_args.optimizer == "adam":
+        # Initialize our Trainer
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset if training_args.do_train else None,
+            eval_dataset=eval_dataset if training_args.do_eval else None,
+            tokenizer=tokenizer,
+            data_collator=data_collator,
+            compute_metrics=compute_metrics if training_args.do_eval else None,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None,
+        )
+    elif data_args.optimizer == "sophia":
+        from optimizers.sophia import SophiaG
+        optimizer = SophiaG(
+            filter(lambda p: p.requires_grad, model.parameters()),
+            lr=training_args.learning_rate,
+            weight_decay=training_args.weight_decay,
+            betas=(training_args.adam_beta1, training_args.adam_beta2),
+        )
+
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=train_dataset if training_args.do_train else None,
+            eval_dataset=eval_dataset if training_args.do_eval else None,
+            tokenizer=tokenizer,
+            data_collator=data_collator,
+            compute_metrics=compute_metrics if training_args.do_eval else None,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None,
+            optimizers=(optimizer, None),
+        )
+    else:
+        raise ValueError("Optimizer not supported")
 
     # Training
     if training_args.do_train:
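
For reference, a minimal standalone sketch of what the "sophia" branch wires up. This is not taken from the repository: the helper name build_sophia_trainer is illustrative, and it assumes the repo-local optimizers/sophia.py exposes SophiaG with exactly the signature used in the diff. Because the scheduler slot of the optimizers tuple is left as None, the Hugging Face Trainer still creates its default learning-rate schedule (linear decay with warmup_steps, unless lr_scheduler_type says otherwise) on top of the supplied optimizer.

    from transformers import Trainer, TrainingArguments
    from optimizers.sophia import SophiaG  # repo-local module imported by the diff

    def build_sophia_trainer(model, training_args: TrainingArguments, **trainer_kwargs) -> Trainer:
        # Hand only trainable parameters to SophiaG, mirroring the filter(...) call in the diff.
        optimizer = SophiaG(
            (p for p in model.parameters() if p.requires_grad),
            lr=training_args.learning_rate,
            weight_decay=training_args.weight_decay,
            betas=(training_args.adam_beta1, training_args.adam_beta2),  # (0.965, 0.99) in the config above
        )
        # Scheduler slot left as None: the Trainer attaches its own schedule
        # (lr_scheduler_type, warmup_steps) to the custom optimizer.
        return Trainer(model=model, args=training_args, optimizers=(optimizer, None), **trainer_kwargs)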