
Commit cb46a5d

add gpt/gpt2/t5 mini implementation
1 parent 0b8e1e2 commit cb46a5d

File tree

src/models/T5.py
src/models/gpt.py
src/models/gpt2.py

3 files changed: +162 −0 lines changed


src/models/T5.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# coding: UTF-8
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer


class Config(object):
    """Configuration parameters"""

    def __init__(self, dataset, pretrained_name_or_path=None):
        self.model_name = 'T5'
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'  # validation set
        self.test_path = dataset + '/data/test.txt'  # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]  # list of class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # trained-model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device

        self.require_improvement = 1000  # stop training early if no improvement after 1000 batches
        self.multi_label = False
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.pad_size = 32  # every sentence is padded or truncated to this length
        self.learning_rate = 5e-5  # learning rate
        self.encoder_path = './t5_pretrain' if not pretrained_name_or_path else pretrained_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.encoder_path)
        self.hidden_size = 768


class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.encoder = AutoModel.from_pretrained(config.encoder_path)
        self.tokenizer = config.tokenizer
        self.device = config.device
        for param in self.encoder.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # input token ids
        mask = x[2]  # attention mask, same shape as the input; padding positions are 0, e.g. [1, 1, 1, 1, 0, 0]
        # Feed a single pad token as the decoder input and classify from its hidden state.
        # ref: https://discuss.huggingface.co/t/t5-classification-using-text2text/504/8
        decoder_input_ids = torch.tensor([self.tokenizer.pad_token_id]).unsqueeze(0).expand(context.size(0), -1).to(
            self.device)
        # outputs: torch.Size([batch_size, 1, 768])
        outputs = self.encoder(context, attention_mask=mask, decoder_input_ids=decoder_input_ids,
                               return_dict=True).last_hidden_state
        # token_ids = torch.argmax(outputs, dim=2)
        # tokens = self.tokenizer.batch_decode(token_ids)

        outputs = self.fc(outputs.squeeze(1))  # squeeze(1), not squeeze(): keeps the batch dim when batch_size == 1
        return outputs
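
For reference, a minimal usage sketch of this T5 classifier, not part of the commit. It assumes the dataset directory layout that Config expects and packs inputs as the (token_ids, seq_len, mask) tuple that forward consumes; the './dataset' path, the 't5-base' checkpoint name, the import path, and the sample sentence are all placeholders.

# Hypothetical usage sketch; paths, checkpoint name, and text are placeholders.
import torch
from models.T5 import Config, Model  # assumed import path for src/models/T5.py

config = Config('./dataset', pretrained_name_or_path='t5-base')
model = Model(config).to(config.device)

enc = config.tokenizer(['a sample sentence'], padding='max_length',
                       truncation=True, max_length=config.pad_size,
                       return_tensors='pt')
context = enc['input_ids'].to(config.device)
mask = enc['attention_mask'].to(config.device)
seq_len = mask.sum(dim=1)  # forward only reads x[0] and x[2]

logits = model((context, seq_len, mask))  # shape: [batch_size, num_classes]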

src/models/gpt.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
# coding: UTF-8
import torch
import torch.nn as nn
from transformers import OpenAIGPTModel, OpenAIGPTTokenizer


class Config(object):
    """Configuration parameters"""

    def __init__(self, dataset, pretrained_name_or_path=None):
        self.model_name = 'gpt'
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'  # validation set
        self.test_path = dataset + '/data/test.txt'  # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]  # list of class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # trained-model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device

        self.require_improvement = 1000  # stop training early if no improvement after 1000 batches
        self.multi_label = False
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.pad_size = 32  # every sentence is padded or truncated to this length
        self.learning_rate = 5e-5  # learning rate
        self.encoder_path = './gpt_pretrain' if not pretrained_name_or_path else pretrained_name_or_path
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(self.encoder_path)
        self.hidden_size = 768


class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.encoder = OpenAIGPTModel.from_pretrained(config.encoder_path)
        for param in self.encoder.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # input token ids
        mask = x[2]  # attention mask, same shape as the input; padding positions are 0, e.g. [1, 1, 1, 1, 0, 0]
        # Model output: a dict or the tuple (last_hidden_state, hidden_states=None, attentions=None).
        # A causal LM uses the last token's hidden state for classification;
        # see also the GPT2ForSequenceClassification implementation: TODO
        # x[1] already holds the lengths, but they can also be derived from the mask.
        lengths = torch.sum(mask, dim=1) - 1  # subtract 1 because sequence indices start at 0
        pooled = self.encoder(context, attention_mask=mask, return_dict=True).last_hidden_state
        last_indices = lengths.unsqueeze(1).unsqueeze(2).expand(-1, -1, pooled.size(-1))  # expand dims to match pooled
        pooled = torch.gather(pooled, dim=1, index=last_indices).squeeze(1)
        out = self.fc(pooled)
        return out
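
A quick self-contained check of the gather-based last-token pooling used above, added here for illustration with made-up tensors: it selects the hidden state at each sequence's last non-padding position, matching direct fancy indexing.

import torch

batch, seq_len, hidden = 2, 6, 4
pooled = torch.randn(batch, seq_len, hidden)  # stand-in for last_hidden_state
mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1]])

lengths = mask.sum(dim=1) - 1  # index of each sequence's last real token: [3, 5]
idx = lengths.unsqueeze(1).unsqueeze(2).expand(-1, -1, hidden)  # shape [batch, 1, hidden]
gathered = torch.gather(pooled, dim=1, index=idx).squeeze(1)

direct = pooled[torch.arange(batch), lengths]  # same selection via fancy indexing
assert torch.equal(gathered, direct)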

src/models/gpt2.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
# coding: UTF-8
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Tokenizer


class Config(object):
    """Configuration parameters"""

    def __init__(self, dataset, pretrained_name_or_path=None):
        self.model_name = 'gpt2'
        self.train_path = dataset + '/data/train.txt'  # training set
        self.dev_path = dataset + '/data/dev.txt'  # validation set
        self.test_path = dataset + '/data/test.txt'  # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]  # list of class names
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # trained-model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device

        self.require_improvement = 1000  # stop training early if no improvement after 1000 batches
        self.multi_label = False
        self.num_classes = len(self.class_list)  # number of classes
        self.num_epochs = 3  # number of epochs
        self.batch_size = 128  # mini-batch size
        self.pad_size = 32  # every sentence is padded or truncated to this length
        self.learning_rate = 5e-5  # learning rate
        self.encoder_path = './gpt2_pretrain' if not pretrained_name_or_path else pretrained_name_or_path
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.encoder_path)
        self.hidden_size = 768


class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.encoder = GPT2Model.from_pretrained(config.encoder_path)
        for param in self.encoder.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # input token ids
        mask = x[2]  # attention mask, same shape as the input; padding positions are 0, e.g. [1, 1, 1, 1, 0, 0]
        # Model output: last_hidden_state, shape [batch_size, seq_len, hidden_size].
        lengths = torch.sum(mask, dim=1) - 1  # subtract 1 because sequence indices start at 0
        pooled = self.encoder(context, attention_mask=mask, return_dict=True).last_hidden_state
        last_indices = lengths.unsqueeze(1).unsqueeze(2).expand(-1, -1, pooled.size(-1))  # expand dims to match pooled
        pooled = torch.gather(pooled, dim=1, index=last_indices).squeeze(1)
        out = self.fc(pooled)
        return out
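
One practical caveat, noted as an aside rather than part of the commit: GPT2Tokenizer ships without a pad token, so batched padding to pad_size will raise an error unless a pad token is assigned first. Reusing the EOS token is the common workaround, and it is safe here because the mask-based last-token pooling above ignores padding positions.

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # 'gpt2' checkpoint name as an example
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no dedicated pad token
enc = tokenizer(['a sample sentence'], padding='max_length',
                truncation=True, max_length=32, return_tensors='pt')
print(enc['attention_mask'])  # padding positions are 0, as forward expects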

0 commit comments
