
Commit 87a6e33

Add files via upload
1 parent f131c29 commit 87a6e33

4 files changed: +179 additions, -0 deletions

ML tips/NLP/reward_model/__init__.py

Whitespace-only changes.
ML tips/NLP/reward_model/reward_model.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
import torch
from torch import nn
from transformers import GPT2Model


class RewardModel(nn.Module):
    """
    GPT-2-based reward model, as in InstructGPT: a linear head maps the
    base model's final hidden state to a single scalar reward.
    """
    def __init__(self, base_model: GPT2Model):
        super().__init__()
        self.model = base_model
        self.reward_head = nn.Linear(base_model.config.hidden_size, 1, bias=False)

    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state  # (bs, seq_len, hid_dim)
        # Score the hidden state at the last position; this assumes that position
        # holds the final real token (i.e. unpadded or left-padded inputs).
        logits = self.reward_head(out[:, -1, :]).squeeze(-1)  # (bs,)
        return logits
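
For reference, a minimal usage sketch of the class above (not part of this commit), assuming the public gpt2 checkpoint and the RewardModel class from reward_model.py: it scores one prompt/response string and returns a single scalar reward per sequence.

import torch
from transformers import AutoTokenizer, GPT2Model

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

reward_model = RewardModel(GPT2Model.from_pretrained("gpt2"))
reward_model.eval()

batch = tokenizer(
    ["Human: What is RLHF?\n\nAssistant: Reinforcement learning from human feedback."],
    return_tensors="pt",
    padding=True,
)
with torch.no_grad():
    rewards = reward_model(batch["input_ids"], batch["attention_mask"])
print(rewards.shape)  # torch.Size([1]) -- one scalar reward per sequence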
ML tips/NLP/reward_model/reward_trainer.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
from transformers import Trainer, PreTrainedModel
from rlhf.data.data import RewardDataCollatorWithPadding
from torch import nn
from rlhf.optimizer.lion import DecoupledLionW
from transformers.trainer_pt_utils import get_parameter_names
import torch
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS


class RewardTrainer(Trainer):
    def __init__(
        self,
        model: PreTrainedModel,
        args,
        data_collator=None,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=None,
        model_init=None,
        compute_metrics=None,
        callbacks=None,
        optimizers=(None, None),
        preprocess_logits_for_metrics=None,
        max_length: int = 512,
        use_lion: bool = False,
    ):
        # data_collator = RewardDataCollatorWithPadding(tokenizer, max_length=max_length)
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )
        self.use_lion = use_lion

    def compute_loss(self, model, inputs, return_outputs=False):
        # Pairwise ranking loss from InstructGPT: -log sigmoid(r_chosen - r_rejected),
        # which pushes the chosen response's reward above the rejected one's.
        rewards_chosen = model(input_ids=inputs["input_ids_chosen"], attention_mask=inputs["attention_mask_chosen"])
        rewards_rejected = model(
            input_ids=inputs["input_ids_rejected"], attention_mask=inputs["attention_mask_rejected"]
        )
        loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()
        if return_outputs:
            return loss, {"rewards_chosen": rewards_chosen, "rewards_rejected": rewards_rejected}
        return loss

    def create_optimizer(self):
        """
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple
        in the Trainer's init through `optimizers`, or subclass and override this method.
        """
        opt_model = self.model

        # Apply weight decay to everything except biases and LayerNorm weights.
        decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
        decay_parameters = [name for name in decay_parameters if "bias" not in name]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
                ],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [
                    p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
                ],
                "weight_decay": 0.0,
            },
        ]

        # Lion or AdamW
        if not self.use_lion:
            optimizer_cls = torch.optim.AdamW
            optimizer_kwargs = {
                "lr": self.args.learning_rate,
                "betas": (self.args.adam_beta1, self.args.adam_beta2),
                "eps": self.args.adam_epsilon,
            }
        else:
            optimizer_cls = DecoupledLionW
            optimizer_kwargs = {
                "lr": self.args.learning_rate,
                "betas": (self.args.adam_beta1, self.args.adam_beta2),
            }

        self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)

        # print(f"Using optimizer {self.optimizer}")

        return self.optimizer
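
A quick, self-contained check of the pairwise loss used in compute_loss above (illustrative only, not part of this commit): the per-pair loss -logsigmoid(r_chosen - r_rejected) is small when the chosen reward already exceeds the rejected one and grows as the ordering flips.

import torch
from torch import nn

rewards_chosen = torch.tensor([2.0, 0.5, -1.0])
rewards_rejected = torch.tensor([1.0, 1.5, -3.0])

per_pair = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected)
print(per_pair)         # ~[0.31, 1.31, 0.13]: largest where rejected outranks chosen
print(per_pair.mean())  # the scalar the trainer backpropagates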

ML tips/NLP/reward_model/train_rm.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
from transformers import HfArgumentParser, TrainingArguments
from transformers import AutoTokenizer, GPT2Model
from datasets import load_dataset

from dataclasses import dataclass, field
from rlhf.reward_model.reward_trainer import RewardTrainer
from rlhf.reward_model.reward_model import RewardModel
from rlhf.data.data import RewardDataCollatorWithPadding, preprocess_function


@dataclass
class RMTrainerArguments(TrainingArguments):
    """
    Extends transformers.TrainingArguments with reward-model-specific options.
    """
    use_lion: bool = field(
        default=False,
        metadata={
            "help": (
                "Use the Lion optimizer instead of AdamW. Takes the same "
                "hyperparameter args as AdamW (betas, weight decay, lr), except eps."
            )
        },
    )


def main():
    parser = HfArgumentParser(RMTrainerArguments)

    trainer_args = parser.parse_args_into_dataclasses()[0]

    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

    collator = RewardDataCollatorWithPadding(tokenizer=tokenizer)

    lm_model = GPT2Model.from_pretrained("gpt2")
    lm_model.config.pad_token_id = tokenizer.eos_token_id

    model = RewardModel(lm_model)

    dataset = load_dataset("Anthropic/hh-rlhf")
    train_ds = dataset["train"]

    train_dataset = train_ds.map(
        preprocess_function,
        batched=True, num_proc=1,
        fn_kwargs={"tokenizer": tokenizer}
    )

    trainer = RewardTrainer(
        model=model,
        args=trainer_args,
        train_dataset=train_dataset,
        data_collator=collator,
    )

    trainer.train()


if __name__ == '__main__':
    main()
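
train_rm.py relies on preprocess_function and RewardDataCollatorWithPadding from rlhf.data.data, which are not included in this commit. As a point of reference only, here is a hypothetical sketch of what preprocess_function might look like for the Anthropic/hh-rlhf "chosen"/"rejected" columns; the output keys are chosen to match what RewardTrainer.compute_loss reads and are an assumption, not the actual module.

# Hypothetical sketch of rlhf.data.data.preprocess_function (not the real module).
def preprocess_function(examples, tokenizer, max_length=512):
    chosen = tokenizer(examples["chosen"], truncation=True, max_length=max_length)
    rejected = tokenizer(examples["rejected"], truncation=True, max_length=max_length)
    return {
        "input_ids_chosen": chosen["input_ids"],
        "attention_mask_chosen": chosen["attention_mask"],
        "input_ids_rejected": rejected["input_ids"],
        "attention_mask_rejected": rejected["attention_mask"],
    }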
