Commit c8e8e61

opt-30b (#15)
1 parent 97ffea4 commit c8e8e61

File tree

10 files changed: +461 −64 lines


examples/opt/generate_opt_30b.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+from flagai.model.predictor.predictor import Predictor
+from flagai.auto_model.auto_loader import AutoLoader
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+loader = AutoLoader(task_name="lm",
+                    model_name="opt-30b-en")
+
+model = loader.get_model()
+tokenizer = loader.get_tokenizer()
+model.eval()
+model.to(device)
+
+text = "The trophy doesn’t fit in the suitcase because "
+predictor = Predictor(model, tokenizer)
+out = predictor.predict_generate_randomsample(text,
+                                              input_max_length=100,
+                                              out_max_length=300,
+                                              top_k=30,
+                                              top_p=0.9,
+                                              repetition_penalty=3.0)
+
+print(f"input is {text} \n out is {out}")

examples/opt/opt_30b_en_mutigpu.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+# os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"
+import torch
+import os
+import argparse
+from flagai import mpu
+from flagai.auto_model.auto_loader import AutoLoader
+import random
+import numpy as np
+from flagai.model.predictor.predictor import Predictor
+
+# run script : python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 opt_30b_en_mutigpu.py
+os.environ["ENV_TYPE"] = "deepspeed+mpu"
+model_parallel_size = 4
+world_size = 4
+
+os.environ["MODEL_PARALLEL_SIZE"] = str(model_parallel_size)
+os.environ["WORLD_SIZE"] = str(world_size)
+
+def set_random_seed(seed):
+    """Set random seed for reproducibility."""
+    if seed is not None and seed > 0:
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        mpu.model_parallel_cuda_manual_seed(seed)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--local_rank',
+                    type=int,
+                    default=0,
+                    help="local_rank")
+
+ds_args = parser.parse_args()
+local_rank = ds_args.local_rank
+
+master_addr = os.environ.get('MASTER_ADDR', '127.0.0.1')
+master_port = os.environ.get('MASTER_PORT', '17501')
+
+device = torch.device("cuda", local_rank)
+
+def initialize_distributed():
+    """Initialize torch.distributed."""
+    torch.backends.cudnn.enabled = False
+    # Manually set the device ids.
+    torch.cuda.set_device(device)
+    # Call the init process
+    init_method = 'tcp://' + master_addr + ':' + master_port
+    torch.distributed.init_process_group(
+        backend='nccl',  # gloo
+        world_size=world_size,
+        rank=local_rank,
+        init_method=init_method)
+    mpu.initialize_model_parallel(model_parallel_size)
+
+initialize_distributed()
+
+set_random_seed(123)
+
+loader = AutoLoader("lm", model_name="opt-30b-en")
+model = loader.get_model()
+model.half()
+tokenizer = loader.get_tokenizer()
+# model.parallel_output = False
+model.eval()
+model.to(device)
+
+torch.distributed.barrier(group=mpu.get_model_parallel_group())
+
+text = """I think The Old Man and the Sea is a very good book, what do you think? I think """
+
+predictor = Predictor(model, tokenizer)
+out = predictor.predict_generate_randomsample(text)
+if mpu.get_model_parallel_rank() == 0:
+    print(f"pred is {out}")
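
As a quick sanity check (not part of the commit), each process spawned by torch.distributed.launch can report how its global rank maps onto the model-parallel group. This assumes the same mpu accessors the script above already relies on:

import torch.distributed as dist
from flagai import mpu

def report_ranks():
    # One line per process: global rank vs. its model-parallel coordinates.
    print(f"global rank {dist.get_rank()}: "
          f"model-parallel rank {mpu.get_model_parallel_rank()} "
          f"in a group of size {mpu.get_model_parallel_world_size()}")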

flagai/model/base_model.py

Lines changed: 8 additions & 2 deletions
@@ -98,11 +98,17 @@ def from_pretrain(cls,
             print(
                 "preparing the model weights for model parallel size = {:02d}"
                 .format(model_parallel_size))
-            from flagai.mp_tools import change_pytorch_model_mp_from_1_to_n, check_pytorch_model_mp_size
+            from flagai.auto_model.auto_loader import MODEL_DICT
+            from flagai.mp_tools import change_pytorch_model_mp_from_1_to_n_new, check_pytorch_model_mp_size
             if model_parallel_size > 1 and not check_pytorch_model_mp_size(
                     download_path, model_parallel_size):
-                change_pytorch_model_mp_from_1_to_n(
+                brief_model_name = MODEL_DICT[model_name.lower()][2]
+                change_pytorch_model_mp_from_1_to_n_new(brief_model_name,
                     download_path, model_parallel_size)
+
+            from flagai import mpu
+            torch.distributed.barrier(group=mpu.get_model_parallel_group())
+
         if model_parallel_size > 1:
             from flagai.mpu import get_model_parallel_rank
             model_parallel_rank = get_model_parallel_rank()
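
For context on what change_pytorch_model_mp_from_1_to_n_new is doing before the barrier, here is a minimal sketch of the usual 1-to-n checkpoint split for tensor model parallelism. The function name, split axes, and output naming below are illustrative assumptions, not FlagAI's actual mp_tools implementation:

import os
import torch

def split_checkpoint_1_to_n(ckpt_path: str, out_dir: str, n: int,
                            sharded_keys=("wte.weight",)):
    """Illustrative 1 -> n split: shard selected tensors along dim 0,
    replicate everything else, and write one file per model-parallel rank."""
    state = torch.load(ckpt_path, map_location="cpu")
    for rank in range(n):
        shard = {}
        for name, tensor in state.items():
            if any(name.endswith(k) for k in sharded_keys):
                shard[name] = torch.chunk(tensor, n, dim=0)[rank].clone()
            else:
                shard[name] = tensor  # replicated parameters
        torch.save(shard, os.path.join(out_dir, f"pytorch_model_{rank:02d}.pt"))

The barrier added right after the split ensures no rank tries to load its shard before the conversion has finished writing all n files.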

flagai/model/glm_model.py

Lines changed: 1 addition & 1 deletion
@@ -458,7 +458,7 @@ def forward(self,
             labels = kwargs['labels']
             if os.getenv("ENV_TYPE") == 'deepspeed+mpu':
                 loss = vocab_parallel_cross_entropy(
-                    logits.contiguous().float(), labels).mean()
+                    logits_parallel.contiguous().float(), labels).mean()
             else:

                 loss = F.cross_entropy(
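
This one-token fix matters: vocab_parallel_cross_entropy expects logits that are still sharded along the vocabulary dimension, so passing the full (gathered) logits tensor would pair the wrong vocabulary slice with each rank's labels. A sketch of the partitioned loss computation follows the gpt2_model.py diff below.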

flagai/model/gpt2_model.py

Lines changed: 64 additions & 12 deletions
@@ -8,15 +8,17 @@
 from flagai.model.layers.embeddings import VocabParallelEmbedding
 from flagai.model.utils import normal_init_method
 from flagai.model.base_model import BaseModel
+import torch.nn.functional as F

 if os.getenv('ENV_TYPE') == 'deepspeed+mpu':
     from flagai.mpu import get_model_parallel_world_size
-    from flagai.mpu import gather_from_model_parallel_region
     from flagai.mpu import get_cuda_rng_tracker
     from flagai.mpu.utils import divide
     if os.getenv('ENV_TYPE') == 'deepspeed+mpu':
-        from flagai.mpu import copy_to_model_parallel_region
         from flagai.mpu.random import checkpoint
+        from flagai.mpu import copy_to_model_parallel_region, gather_from_model_parallel_region
+        from flagai.mpu.cross_entropy import vocab_parallel_cross_entropy
+
 elif os.getenv('ENV_TYPE') == 'deepspeed':
     from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
 else:
@@ -224,6 +226,7 @@ def __init__(self, config, **kwargs):
             do_layer_norm_before=self.config.get("do_layer_norm_before", True),
         )
         self.config = config_gpt
+        self.parallel_output = True

         self.transformer = GPT2Stack(self.config)
         self.lm_head = nn.Linear(self.config.n_embd,
@@ -266,21 +269,70 @@ def forward(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
         )
-        hidden_states = transformer_outputs
+        logits = transformer_outputs
+
+        if os.getenv("ENV_TYPE") == 'deepspeed+mpu':
+            logits_parallel = copy_to_model_parallel_region(logits)
+        else:
+            logits_parallel = logits

-        lm_logits = self.lm_head(hidden_states)
+        # if self.output_predict:
+        # Parallel logits.
+        logits_parallel = F.linear(logits_parallel,
+                                   self.transformer.wte.weight)

-        return_data = {"logits": lm_logits}
         if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_logits = logits_parallel[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
-            return_data["loss"] = loss

-        return return_data
+            if os.getenv("ENV_TYPE") == 'deepspeed+mpu':
+                loss = vocab_parallel_cross_entropy(
+                    shift_logits.contiguous().float(), shift_labels).mean()
+            else:
+                loss = F.cross_entropy(
+                    shift_logits.contiguous().float(), shift_labels.long())
+
+            if self.parallel_output:  # Put in different GPUs
+                return {
+                    'logits': logits_parallel,
+                    'loss': loss,
+                    'hidden_states': None,
+                }
+            else:
+                return {
+                    "logits":
+                    gather_from_model_parallel_region(logits_parallel),
+                    "loss":
+                    loss,
+                    "hidden_states":
+                    None,
+                }
+        else:
+            if self.parallel_output:  # Put in different GPUs
+                return {
+                    'logits': logits_parallel,
+                    'hidden_states': None,
+                }
+            else:
+                return {
+                    "logits":
+                    gather_from_model_parallel_region(logits_parallel),
+                    "hidden_states":
+                    None,
+                }
+
+        # lm_logits = self.lm_head(hidden_states)
+        # return_data = {"logits": lm_logits}
+        # if labels is not None:
+        #     # Shift so that tokens < n predict n
+        #     shift_logits = lm_logits[..., :-1, :].contiguous()
+        #     shift_labels = labels[..., 1:].contiguous()
+        #     loss_fct = nn.CrossEntropyLoss()
+        #     loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+        #                     shift_labels.view(-1))
+        #     return_data["loss"] = loss
+
+        # return return_data

     def load_weights(self, checkpoint_path):
         checkpoint = torch.load(checkpoint_path,
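
Two things happen in the rewritten forward: the lm_head projection is replaced by a weight-tied F.linear against the token embedding (transformer.wte.weight), and the loss switches to vocab_parallel_cross_entropy when the vocabulary is sharded across the model-parallel group. As a rough single-process sketch of that loss (illustrative only; in a real mpu implementation the reductions over shards below are MAX/SUM all-reduces across ranks):

import torch

def sharded_cross_entropy(logits_shards, labels, vocab_starts):
    """Toy illustration of vocab-parallel cross entropy.
    logits_shards[i] holds the logits for vocab ids starting at
    vocab_starts[i]; labels is a 1-D LongTensor of target ids."""
    # 1. Global max over the full vocabulary, for numerical stability.
    global_max = torch.stack(
        [s.max(dim=-1).values for s in logits_shards]).max(dim=0).values
    # 2. Global sum of exp(logit - max) over all shards.
    sum_exp = sum((s - global_max.unsqueeze(-1)).exp().sum(dim=-1)
                  for s in logits_shards)
    # 3. Fetch each label's logit from the shard that owns that vocab id.
    target_logit = torch.zeros_like(global_max)
    for start, shard in zip(vocab_starts, logits_shards):
        local = labels - start
        owned = (local >= 0) & (local < shard.size(-1))
        target_logit[owned] = shard[owned, local[owned]]
    # Cross entropy per token: log(sum_exp) + max - target_logit.
    return sum_exp.log() + global_max - target_logit

Averaging this per-token vector reproduces the .mean() in the diff; the parallel_output flag then controls whether callers get the sharded logits back or a gathered full-vocabulary tensor.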

flagai/model/layers/attentions.py

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ def __init__(self,
         self.scale = scale
         if os.getenv("ENV_TYPE") == 'deepspeed+mpu':
             world_size = get_model_parallel_world_size()
+            self.split_size = divide(n_state, world_size)
+
             self.hidden_size_per_partition = divide(nx, world_size)
             self.hidden_size_per_attention_head = divide(nx, self.n_head)
             self.num_attention_heads_per_partition = divide(
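
The divide helper imported from flagai.mpu.utils is presumably the usual Megatron-style exact division; a plausible sketch, to show why an uneven hidden-size or head split fails fast here rather than silently truncating:

def divide(numerator: int, denominator: int) -> int:
    # Exact integer division: raise immediately on uneven splits.
    assert numerator % denominator == 0, (
        f"{numerator} is not evenly divisible by {denominator}")
    return numerator // denominator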

flagai/model/opt_model.py

Lines changed: 35 additions & 35 deletions
@@ -265,41 +265,41 @@ def __init__(self, config, **kwargs):
         # self.config = config
         self.transformer = OPTStack(self.config)

-    def forward(
-        self,
-        **data,
-    ):
-        input_ids = data.get("input_ids", None)
-        # attention_mask = data.get("attention_mask", None)
-        # position_ids = data.get("position_ids", None)
-        labels = data.get("labels", None)
-        use_cache = data.get("use_cache", None)
-        output_attentions = data.get("output_attentions", None)
-        output_hidden_states = data.get("output_hidden_states", True)
-
-        transformer_outputs = self.transformer(
-            input_ids,
-            attention_mask=None,
-            position_ids=None,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-        )
-        hidden_states = transformer_outputs
-
-        lm_logits = self.lm_head(hidden_states)
-
-        return_data = {"logits": lm_logits}
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
-            return_data["loss"] = loss
-
-        return return_data
+    # def forward(
+    #     self,
+    #     **data,
+    # ):
+    #     input_ids = data.get("input_ids", None)
+    #     # attention_mask = data.get("attention_mask", None)
+    #     # position_ids = data.get("position_ids", None)
+    #     labels = data.get("labels", None)
+    #     use_cache = data.get("use_cache", None)
+    #     output_attentions = data.get("output_attentions", None)
+    #     output_hidden_states = data.get("output_hidden_states", True)
+    #
+    #     transformer_outputs = self.transformer(
+    #         input_ids,
+    #         attention_mask=None,
+    #         position_ids=None,
+    #         use_cache=use_cache,
+    #         output_attentions=output_attentions,
+    #         output_hidden_states=output_hidden_states,
+    #     )
+    #     hidden_states = transformer_outputs
+    #
+    #     lm_logits = self.lm_head(hidden_states)
+    #
+    #     return_data = {"logits": lm_logits}
+    #     if labels is not None:
+    #         # Shift so that tokens < n predict n
+    #         shift_logits = lm_logits[..., :-1, :].contiguous()
+    #         shift_labels = labels[..., 1:].contiguous()
+    #         loss_fct = nn.CrossEntropyLoss()
+    #         loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+    #                         shift_labels.view(-1))
+    #         return_data["loss"] = loss
+    #
+    #     return return_data


     def load_weights(self, checkpoint_path):
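
With its own forward commented out, OPTModel now falls through to the forward of its GPT-2 base class shown above, which is presumably the point of this change: OPT inherits the model-parallel logits and loss handling instead of duplicating the old single-GPU path.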

flagai/model/predictor/predictor.py

Lines changed: 2 additions & 14 deletions
@@ -8,24 +8,12 @@
     t5_random_sample, gpt_random_sample, \
     t5_beamsearch, gpt_beamsearch, bert_random_sample, glm_beamsearch, glm_random_sample
 from typing import List, Union, Dict, Tuple, Any
-from flagai.model.bert_model import BertModel, BertForMaskLM, BertForSeq2seq, BertForSequenceLabeling, \
-    BertForSequenceLabelingGP, BertForSequenceLabelingCRF, BertForClsClassifier
-from flagai.model.gpt2_model import GPT2Model
-from flagai.model.t5_model import T5Model
-from flagai.data.tokenizer.bert.bert_tokenizer import BertTokenizer
-from flagai.data.tokenizer.t5.t5_pegasus_tokenizer import T5PegasusTokenizer
-from flagai.data.tokenizer.glm_large_ch.glm_large_ch_tokenizer import GLMLargeChTokenizer
-

 class Predictor:

     def __init__(self,
-                 model: Union[BertModel, GPT2Model, BertForSequenceLabelingGP,
-                              BertForSequenceLabelingCRF, BertForClsClassifier,
-                              BertForClsClassifier, BertForSequenceLabeling,
-                              BertForSeq2seq, T5Model, BertForMaskLM],
-                 tokenizer: Union[GLMLargeChTokenizer, BertTokenizer,
-                                  T5PegasusTokenizer]):
+                 model,
+                 tokenizer):
         """
         Args:
             model: The model loaded by the AutoLoader class.
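
Dropping the concrete Union type hints makes Predictor duck-typed: any model/tokenizer pair produced by AutoLoader is accepted, and the predictor module no longer has to import every model and tokenizer class at load time.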
