Commit: [feature] Enable model sharding on seq_scheduler tested on gpt_neox_20B (deepjavalibrary#1086)
Co-authored-by: KexinFeng <fenkexin@amazon.com>
Showing 4 changed files with 219 additions and 8 deletions.
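Here, "model sharding" means tensor parallelism: the weight matrices of the 20B-parameter gpt-neox-20b are split across GPUs, which the tests below request via `tensor_parallel_degree: 2` and lmi_dist's `GPTNeoxSharded`. The sharding itself lives in lmi_dist, outside this diff; the snippet below is only a minimal plain-torch sketch of the column-parallel idea, with all names local to the sketch:

    import torch

    # Toy column-parallel sharding: split a weight along its output dimension,
    # let each "rank" compute its slice, then concatenate (the all-gather step).
    torch.manual_seed(0)
    world_size = 2                # mirrors tensor_parallel_degree = 2
    x = torch.randn(1, 8)         # one input row, hidden size 8
    w = torch.randn(16, 8)        # full, unsharded weight

    shards = torch.chunk(w, world_size, dim=0)      # one slice per rank
    partials = [x @ shard.T for shard in shards]    # each rank's local matmul
    y_sharded = torch.cat(partials, dim=-1)

    assert torch.allclose(y_sharded, x @ w.T)       # matches the unsharded result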
...python/setup/djl_python/tests/rolling_batch_test_scripts/test_rolling_batch_scheduler2.py (60 additions, 0 deletions)
@@ -0,0 +1,60 @@
from collections import defaultdict
import torch
from djl_python.rolling_batch import SchedulerRollingBatch
import torch.distributed as dist


def print_rank0(content):
    # Print only once when running under torch.distributed (rank 0).
    if not dist.is_initialized() or dist.get_rank() == 0:
        print(content)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

properties = {
    "tensor_parallel_degree": 2,
    "dtype": "fp16",
    "max_rolling_batch_size": 8,
    "model_loading_timeout": 7200,
    "max_rolling_batch_prefill_tokens": 10000,
    "paged_attention": "True"
}

model_id = "EleutherAI/gpt-neox-20b"
# Example request payload:
# {"inputs":"write a program to add two numbers in python","parameters":{"max_new_tokens":1000, "do_sample":true, "temperature":0.7}}

input_str = [
    "Memories follow me left and right", "Memories follow me left and right."
]

# Greedy-style decoding: do_sample=False with a near-zero temperature.
params = [{
    "max_new_tokens": 50,
    "do_sample": False,
    "temperature": 0.000007
}, {
    "max_new_tokens": 50,
    "do_sample": False,
    "temperature": 0.000007
}]

# ===================== lmi ============================
print("=========== lmi =========")
rolling_batch = SchedulerRollingBatch(model_id, device, properties)
rolling_batch.output_formatter = None
print("reach here")

# First step: submit both requests and collect the first generated chunk.
output_all = defaultdict(list)
result = rolling_batch.inference(input_str, params)
for i, res in enumerate(result):
    output_all[i].append(res['data'])

# Subsequent steps: each call contributes the next chunk per request.
for _ in range(50):
    result = rolling_batch.inference(input_str, params)
    for i, res in enumerate(result):
        output_all[i].append(res['data'])

for i, out in enumerate(output_all.values()):
    print_rank0(input_str[i] + ''.join(out))
    print_rank0('\n====')
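A note on the loop above: the script calls `inference` 51 times and joins each call's `res['data']`, so each call contributes the next chunk of text for every active request. For readers new to the rolling-batch pattern, here is a dependency-free toy of that bookkeeping, with all names invented for the sketch:

    from collections import defaultdict

    def fake_next_token(req_id, step):
        # Stand-in for a model forward; real code would decode one token here.
        return f"<tok{req_id}.{step}>"

    active = {0: 5, 1: 3}                  # request id -> tokens left to generate
    output_all = defaultdict(list)

    step = 0
    while active:
        for req_id in list(active):        # list() so we can retire mid-loop
            output_all[req_id].append(fake_next_token(req_id, step))
            active[req_id] -= 1
            if active[req_id] == 0:        # finished: its slot frees up
                del active[req_id]
        step += 1

    for req_id, toks in output_all.items():
        print(req_id, ''.join(toks))

In the real scheduler, those freed slots are what let new requests join mid-flight, which is the point of rolling (continuous) batching.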
engines/python/setup/djl_python/tests/rolling_batch_test_scripts/test_scheduler_sharded.py (126 additions, 0 deletions)
@@ -0,0 +1,126 @@
from djl_python.scheduler.lm_block import HuggingfaceBlock
from djl_python.scheduler.seq_batch_scheduler import SeqBatchScheduler
from transformers import AutoConfig
from djl_python.scheduler.search_config import SearchConfig
import torch
from transformers import AutoTokenizer

from lmi_dist.models.gpt_neox import GPTNeoxSharded
from lmi_dist.utils import download_and_convert_weights

global_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class TestSchedulerSharded:

    def test_lm_block(self):
        model_id = "EleutherAI/gpt-neox-20b"
        download_and_convert_weights(model_id)
        model = GPTNeoxSharded(model_id)

        device = model.device
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        encoding = tokenizer("Hello, my dog is cute", return_tensors="pt")
        input_ids_0 = encoding.data['input_ids']
        seq_len = input_ids_0.shape[1]

        lm_block = HuggingfaceBlock(model)

        # Batch of 2: input_ids, position_ids, attention_mask.
        input0 = [
            torch.repeat_interleave(input_ids_0, dim=0, repeats=2).to(device),
            torch.repeat_interleave(torch.arange(seq_len)[None, :],
                                    dim=0,
                                    repeats=2).to(device),
            torch.repeat_interleave(torch.ones(seq_len,
                                               dtype=torch.int64)[None, :],
                                    dim=0,
                                    repeats=2).to(device)
        ]

        # Prefill pass: no KV cache yet.
        output0 = lm_block.forward(*input0, None)

        model_config = AutoConfig.from_pretrained(model_id)
        assert len(output0.past_key_values) == model_config.num_hidden_layers

        # Incremental step reusing kv_cache: one new token per sequence.
        # k: [32, 64, 6], v: [32, 6, 64], [batch*head, kvDim, seq]
        past_key_values = output0.past_key_values
        input_ids = torch.tensor([[404], [405]]).to(device)
        past_seq = past_key_values[0][0].shape[-2]
        position_ids = torch.tensor([[past_seq], [past_seq]]).to(device)
        attention_mask = torch.ones(2, past_seq + 1,
                                    dtype=torch.int64).to(device)
        output1 = lm_block.forward(input_ids, position_ids, attention_mask,
                                   past_key_values)
        assert len(output1.past_key_values) == model_config.num_hidden_layers

    def test_contrastive_scheduler(self):
        model_id = "EleutherAI/gpt-neox-20b"
        download_and_convert_weights(model_id)
        model = GPTNeoxSharded(model_id)

        device = model.device
        tokenizer = AutoTokenizer.from_pretrained(model_id,
                                                  padding_side='left')
        tokenizer.pad_token = tokenizer.eos_token

        lm_block = HuggingfaceBlock(model)

        search_config = SearchConfig()
        search_config.pad_token_id = tokenizer.pad_token_id
        PAD = search_config.pad_token_id
        scheduler = SeqBatchScheduler(lm_block, "contrastive", search_config)

        input_ids_0 = tokenizer.encode(
            'Memories follow me left and right. I can',
            return_tensors='pt').to(device)
        request_ids = torch.tensor([[0]])

        # Test init_forward
        scheduler.add_request(input_ids_0, request_ids)

        # Merge longer sequences
        input12 = [
            r"When your legs don't work like they used to before And I can't sweep you off",
            r"There's a time that I remember, when I did not know"
        ]
        input_ids = tokenizer(input12, return_tensors='pt',
                              padding=True).input_ids.to(device)

        request_ids = torch.tensor([[1], [2]])
        scheduler.add_request(input_ids, request_ids)

        # Forward pass
        for _ in scheduler.increment_forward(20):
            pass

        results = scheduler.results

        # Merge shorter sequences: left-pad the shorter prompt by hand so
        # both rows align on the right.
        input_ids_1 = tokenizer.encode("When your legs don't work",
                                       return_tensors='pt')
        input_ids_2 = torch.concat([
            torch.tensor([PAD, PAD]),
            tokenizer.encode("There's a time", return_tensors='pt')[0]
        ]).view(1, -1)
        input_ids = torch.concat([input_ids_1, input_ids_2], dim=0).to(device)
        request_ids = torch.tensor([[3], [4]])

        scheduler.add_request(input_ids, request_ids)

        # Forward pass
        for _ in scheduler.increment_forward(100):
            pass

        for i, ret in results.items():
            print('\n{}:'.format(i), tokenizer.decode(ret))


if __name__ == '__main__':
    # unittest.main()

    c = TestSchedulerSharded()
    # c.test_lm_block()
    # c.test_contrastive_scheduler()
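On the manual `[PAD, PAD]` concat in test_contrastive_scheduler: with `padding_side='left'`, shorter prompts are padded on the left so every row's last token is real, which keeps position ids and the KV cache aligned during batched decoding; the test reproduces by hand what `tokenizer(..., padding=True)` did for the earlier pair. A stand-alone toy of that alignment (fake token ids, no model or tokenizer):

    import torch

    # Toy left-padding of two unequal prompts into one right-aligned batch.
    PAD = 0
    seq_a = torch.tensor([11, 12, 13, 14, 15])   # 5 "tokens"
    seq_b = torch.tensor([21, 22, 23])           # 3 "tokens"
    max_len = max(len(seq_a), len(seq_b))

    def left_pad(seq):
        pad = torch.full((max_len - len(seq),), PAD, dtype=seq.dtype)
        return torch.cat([pad, seq])

    batch = torch.stack([left_pad(seq_a), left_pad(seq_b)])
    attention_mask = (batch != PAD).long()       # zeros over the padded prefix

    print(batch)           # [[11, 12, 13, 14, 15], [ 0,  0, 21, 22, 23]]
    print(attention_mask)  # [[ 1,  1,  1,  1,  1], [ 0,  0,  1,  1,  1]]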