Commit bc9063a

resolve rebase conflicts on Branch feat/online-serving
1 parent 61a1b2e

4 files changed (+31 -13 lines)


colossalai/inference/core/engine.py

+3 -10

@@ -527,16 +527,9 @@ def generate(
             List[str]: Inference result returned by one generation.
         """
         with torch.inference_mode():
-<<<<<<< HEAD
-
             if isinstance(prompts, str) and isinstance(request_ids, int):
-                prompts = [prompts]
-                request_ids = [request_ids]
-=======
-            if prompts is not None or prompts_token_ids is not None:
-                self.add_request(request_ids=request_ids, prompts=prompts, prompts_token_ids=prompts_token_ids)
->>>>>>> [Inference] Fix bugs and docs for feat/online-server (#5598)
-
+                prompts = [prompts]
+                request_ids = [request_ids]
             if prompts is not None or prompts_token_ids is not None:
                 gen_config_dict = generation_config.to_dict() if generation_config is not None else {}
                 self.add_request(
@@ -545,7 +538,7 @@ def generate(
                     prompts_token_ids=prompts_token_ids,
                     **gen_config_dict,
                )
-
+
             output_seqs_list = []
             total_tokens_list = []
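The resolution keeps HEAD's scalar-to-list normalization and routes everything through the single `add_request` call that also forwards the generation config. A minimal sketch of the resulting behavior; the engine construction below is an assumption, while the `generate()` keyword names are taken from the diff:

```python
# Sketch of calling generate() after this resolution. The engine/tokenizer
# setup is assumed; the generate() keyword names come from the diff above.
from transformers import AutoModelForCausalLM, AutoTokenizer

from colossalai.inference.config import InferenceConfig
from colossalai.inference.core.engine import InferenceEngine

model = AutoModelForCausalLM.from_pretrained("path/to/llama2")  # placeholder path
tokenizer = AutoTokenizer.from_pretrained("path/to/llama2")
engine = InferenceEngine(model, tokenizer, InferenceConfig())  # assumed signature

# A bare string and int are normalized to singleton lists before add_request:
out = engine.generate(request_ids=0, prompts="hello, who are you?")

# ...which is equivalent to passing the lists explicitly:
out = engine.generate(request_ids=[0], prompts=["hello, who are you?"])
```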

colossalai/inference/server/README.md

+27

@@ -0,0 +1,27 @@
+# Online Service
+Colossal-Inference supports a FastAPI-based online service. Both simple completion and chat are supported. Follow the commands below
+to construct a server with both completion and chat functionality. For now we only support the `Llama` model; we will fill in the
+blanks quickly.
+
+# Usage
+```bash
+# First, launch an API server locally.
+python3 -m colossalai.inference.server.api_server --model <path_to_your_llama2_model> --chat_template "{% for message in messages %}
+{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}"
+
+
+# Second, open `http://127.0.0.1:8000/docs` in a browser to check the API.
+
+# For the completion service, invoke it like this:
+curl -X POST http://127.0.0.1:8000/completion -H 'Content-Type: application/json' -d '{"prompt":"hello, who are you? ","stream":"False"}'
+
+# For the chat service, invoke it like this:
+curl -X POST http://127.0.0.1:8000/chat -H 'Content-Type: application/json' -d '{"conversation":
+[{"role": "system", "content": "you are a helpful assistant"},
+{"role": "user", "content": "what is 1+1?"}],
+"stream": "False"}'
+# If you just want to test simple generation, use the generate API:
+curl -X POST http://127.0.0.1:8000/generate -H 'Content-Type: application/json' -d '{"prompt":"hello, who are you? ","stream":"False"}'
+
+```
+We also support streaming output; simply change `stream` to `True` in the request body.
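The same endpoints can be exercised from Python. A minimal client sketch mirroring the curl examples in the README; the URL and request bodies come from the README itself, while the framing of streamed chunks is an assumption and may need adjusting to the server's actual output:

```python
# Minimal Python client for the endpoints above, mirroring the curl examples.
# The base URL and payloads come from the README; how streamed chunks are
# framed is an assumption.
import requests

BASE = "http://127.0.0.1:8000"

# Non-streaming completion, same payload as the curl example.
resp = requests.post(f"{BASE}/completion",
                     json={"prompt": "hello, who are you? ", "stream": "False"})
print(resp.json())

# Streaming: set stream to True and consume the body incrementally.
with requests.post(f"{BASE}/completion",
                   json={"prompt": "hello, who are you? ", "stream": "True"},
                   stream=True) as resp:
    for chunk in resp.iter_lines():
        if chunk:
            print(chunk.decode("utf-8"))
```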

colossalai/kernel/triton/no_pad_rotary_embedding.py

-2

@@ -598,8 +598,6 @@ def decoding_fused_rotary_embedding(
     """
     q_total_tokens, q_head_num, head_dim = q.shape
     assert q.size(0) == k.size(0) == v.size(0)
-    assert k.size(1) == v.size(1)
-    assert k_cache.size(-1) == v_cache.size(-1)

     if head_dim >= 512:
         num_warps = 16

tests/test_infer/test_continuous_batching.py

+1 -1

@@ -89,7 +89,7 @@ def check_continuous_batching(prompt_template):
 
 
 def run_dist(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost")
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
     check_continuous_batching()
 
 
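The `config` argument was dropped from `colossalai.launch` to match the current launcher signature. For context, `run_dist` helpers like this one are typically driven through the testing spawn utility; a sketch under the assumption that `colossalai.testing.spawn` forwards rank, world_size, and a free port to the target function (the decorator and exact signature are assumptions):

```python
# Hypothetical driver for run_dist above. That colossalai.testing.spawn
# injects rank, world_size, and a free port into the target function is an
# assumption inferred from run_dist's signature in this diff.
import pytest

from colossalai.testing import rerun_if_address_is_in_use, spawn


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_continuous_batching():
    spawn(run_dist, 1)  # 1 process; spawn supplies rank/world_size/port
```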
