Commit
[LMIDist] Add test that calls LmiDistRollingBatch with a python script (deepjavalibrary#1443)

Co-authored-by: KexinFeng <fenkexin@amazon.com>
Showing 1 changed file with 177 additions and 0 deletions.
@@ -0,0 +1,177 @@
#!/usr/bin/env python
#
# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file
# except in compliance with the License. A copy of the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import gc
import os
import sys
import unittest

import torch


class TestLmiDist(unittest.TestCase):

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is not available")
    def test_models(self):
        # === Preparation ===
        # Make the repository root importable so djl_python resolves, and add
        # the installed lmi_dist package to the module search path.
        script_directory = os.path.dirname(os.path.abspath(__file__))
        relative_path = "../../"
        new_path = os.path.normpath(
            os.path.join(script_directory, relative_path))
        sys.path.append(new_path)
        sys.path.append("/usr/local/lib/python3.9/dist-packages/lmi_dist")
        from djl_python.rolling_batch.lmi_dist_rolling_batch import LmiDistRollingBatch
        from djl_python.tests.rolling_batch_test_scripts.generator import Generator, print_rank0

        # --- Models ---
        model_names = [
            "TheBloke/Llama-2-13B-Chat-fp16",
            # "TheBloke/Llama-2-7B-Chat-fp16",
            # TODO: fix this. weight model.layers.0.self_attn.rotary_emb.inv_freq does not exist
            # "TheBloke/Llama-2-7B-Chat-AWQ",
            # "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
            # TODO: fix this. weight model.layers.0.self_attn.rotary_emb.inv_freq does not exist
            # "TinyLlama/TinyLlama-1.1B-python-v0.1",
            # Runs on a single GPU of a g5.12xlarge, but there is no way to fully
            # clear GPU memory after running llama-2-7b, which causes OOM.
            # "codellama/CodeLlama-7b-hf"
        ]
        # Expected prompt + first-30-token completion, keyed by model and request id.
        expected_text_30 = {
            "TheBloke/Llama-2-13B-Chat-fp16": {
                1:
                'Hello, my name is Dr. [Last Name] and I am a licensed clinical psychologist with over 10 years of experience working with children, ad',
                2:
                'The president of the United States is the head of the executive branch and the highest-ranking official in the federal government. The president is elected by the people through the Electoral',
                3:
                'The capital of France is Paris.\n\nThe capital of Germany is Berlin.\n\nThe capital of Italy is Rome.\n\nThe capital of Spain is Madrid.',
                4:
                'The future of AI is not just about building smarter machines, but also about ensuring that these machines are used for the betterment of society.\n\nAs A',
                5:
                'Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is'
            },
            "TheBloke/Llama-2-7B-Chat-fp16": {
                1:
                'Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of experience. I am reaching out to you today',
                2:
                'The president of the United States is the head of the executive branch of the federal government and is one of the most powerful political figures in the world. The president is elected by the',
                3:
                'The capital of France is Paris. It is located in the northern central part of the country and is known for its stunning architecture, art museums, fashion, and',
                4:
                "The future of AI is bright, but it's not without its challenges. Here are some of the biggest challenges that AI will face in the future:",
                5:
                'Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is'
            },
            "TinyLlama/TinyLlama-1.1B-Chat-v0.6": {
                1:
                "Hello, my name is [Your Name] and I am a [Your Job Title] at [Your Company Name]. I am interested in learning more about your company'",
                2:
                'The president of the United States is a man named Donald Trump.\n\n2. The president of the United States is a man named Donald Trump.\n\n3. The president',
                3:
                'The capital of France is Paris.\n\n2. The capital of the United States is Washington, D.C.\n\n3. The capital of Canada is Ott',
                4:
                "The future of AI is bright, and it's already here. With the help of AI, we can create more personalized experiences, automate repetitive tasks",
                5:
                'Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is Hello, my name is'
            }
        }
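        # NOTE: with do_sample=False and temperature close to 0, decoding is
        # effectively greedy, so the first 30 generated tokens are deterministic
        # and can be compared verbatim against the reference strings above.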

        # === Test ===
        for model_id in model_names:
            properties = {
                "mpi_mode": "true",
                "tensor_parallel_degree": 1,
                "dtype": "fp16",
                "max_rolling_batch_size": 28,
                "model_loading_timeout": 3600,
                "model_id": model_id
            }
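            # These keys mirror the serving.properties options that djl-serving
            # forwards to the rolling-batch backend (tensor parallel degree,
            # dtype, max rolling batch size, model load timeout).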

            # ===================== lmi_dist ============================
            # Each MPI rank drives one GPU; RANK defaults to 0 outside mpirun.
            device = int(os.environ.get("RANK", 0))
            properties["device"] = device

            rolling_batch = LmiDistRollingBatch(model_id, device, properties)
            rolling_batch.output_formatter = None

            gen = Generator(rolling_batch=rolling_batch)

            print('========== init inference ===========')
            # The trailing numbers appear to be the prompt token counts.
            input_str1 = [
                "Hello, my name is",  # 6
                "The president of the United States is",  # 8
                "The capital of France is",  # 6
                "The future of AI is"  # 7
            ]

            # Identical greedy sampling parameters for every request.
            params1 = [{
                "max_new_tokens": 100,
                "do_sample": False,
                "temperature": 0.001
            } for _ in range(len(input_str1))]
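
            # step=10 presumably advances decoding by ten token iterations, while
            # input_str_delta / params_delta enqueue new requests into the batch
            # already in flight (the rolling-batch behavior under test).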
            gen.step(step=10, input_str_delta=input_str1, params_delta=params1)

            # A single extra round of requests; widen the range to add more rounds.
            for _ in range(1):
                print('========== inference_1 ===========')
                input_str_delta = [
                    "Hello, my name is Hello, my name is Hello, my name is Hello, my name is",  # 22
                    "Hello, my name is Hello, my name is Hello, my name is"  # 17
                ]

                params_delta = [{
                    "max_new_tokens": 100,
                    "do_sample": False,
                    "temperature": 0.001
                } for _ in range(len(input_str_delta))]

                gen.step(step=10,
                         input_str_delta=input_str_delta,
                         params_delta=params_delta)

            # Step far past max_new_tokens so every request finishes.
            print('========== inference_infty ===========')
            gen.step(step=500)
            for req_id, out in gen.output_all.items():
                print_rank0(
                    f"\n====req_id: {req_id}=====\n{gen.input_all[req_id][0] + ''.join(out)}\n"
                )
                if model_id in expected_text_30 and req_id in expected_text_30[model_id]:
                    assert expected_text_30[model_id][req_id] == \
                        gen.input_all[req_id][0] + ''.join(out[:30])

            # Reset and release GPU memory before loading the next model.
            rolling_batch.reset()
            rolling_batch.model = None
            rolling_batch = None
            gc.collect()
            torch.cuda.empty_cache()
            # The reset_max_memory_* calls are deprecated aliases of
            # torch.cuda.reset_peak_memory_stats() in recent PyTorch releases.
            torch.cuda.reset_max_memory_allocated()
            torch.cuda.reset_max_memory_cached()


if __name__ == '__main__':
    unittest.main()
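Since the file ends with unittest.main(), it can be run directly. A minimal
invocation on a CUDA host with lmi_dist installed (the file name here is
hypothetical; RANK defaults to 0 when not launched through MPI):

    python test_lmi_dist_rolling_batch.py -v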