Skip to content

Commit ea424c7

Browse files
committed
fix unit test
2 parents ad7cb7a + f88c159 commit ea424c7

80 files changed

Lines changed: 2461 additions & 463 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

build.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ function copy_ops(){
9696
TMP_PACKAGE_DIR="${LEGACY_PACKAGE_DIR}"
9797
else
9898
echo -e "${RED}[Error]${NONE} Neither modern nor legacy directory for gpu ops found in ${OPS_TMP_DIR}"
99+
echo -e "${BLUE}[Info]${NONE} Maybe the compilation failed, please clean the build directory (currently ${BUILD_DIR}) and egg directory (currently ${EGG_DIR}) and try again."
100+
echo -e "${BLUE}[Info]${NONE} If the build still fails, please try to use a clean FastDeploy code and a clean environment to compile again."
101+
exit 1
99102
fi
100103

101104
# Handle CPU ops directory compatibility between modern and legacy naming

custom_ops/gpu_ops/get_output.cc

Lines changed: 39 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -26,71 +26,70 @@
2626
#define MAX_BSZ 512
2727
// #define GET_OUTPUT_DEBUG
2828
struct msgdata {
29-
long mtype;
30-
int mtext[MAX_BSZ + 2]; // stop_flag, bsz, tokens
29+
long mtype;
30+
int mtext[MAX_BSZ + 2]; // stop_flag, bsz, tokens
3131
};
3232

3333
void GetOutput(const paddle::Tensor& x,
3434
int64_t rank_id,
3535
bool wait_flag,
3636
int msg_queue_id) {
37-
if (rank_id > 0) {
38-
return;
39-
}
40-
static struct msgdata msg_rcv;
41-
if (const char* inference_msg_queue_id_env_p =
42-
std::getenv("INFERENCE_MSG_QUEUE_ID")) {
43-
std::string inference_msg_queue_id_env_str(
44-
inference_msg_queue_id_env_p);
45-
int inference_msg_queue_id_from_env =
46-
std::stoi(inference_msg_queue_id_env_str);
37+
if (rank_id > 0) {
38+
return;
39+
}
40+
static struct msgdata msg_rcv;
41+
if (const char* inference_msg_queue_id_env_p =
42+
std::getenv("INFERENCE_MSG_QUEUE_ID")) {
43+
std::string inference_msg_queue_id_env_str(inference_msg_queue_id_env_p);
44+
int inference_msg_queue_id_from_env =
45+
std::stoi(inference_msg_queue_id_env_str);
4746
#ifdef GET_OUTPUT_DEBUG
48-
std::cout << "Your INFERENCE_MSG_QUEUE_ID is: "
49-
<< inference_msg_queue_id_from_env << std::endl;
47+
std::cout << "Your INFERENCE_MSG_QUEUE_ID is: "
48+
<< inference_msg_queue_id_from_env << std::endl;
5049
#endif
51-
msg_queue_id = inference_msg_queue_id_from_env;
52-
}
53-
static key_t key = ftok("/dev/shm", msg_queue_id);
54-
static int msgid = msgget(key, IPC_CREAT | 0666);
50+
msg_queue_id = inference_msg_queue_id_from_env;
51+
}
52+
static key_t key = ftok("/dev/shm", msg_queue_id);
53+
static int msgid = msgget(key, IPC_CREAT | 0666);
5554

5655
#ifdef GET_OUTPUT_DEBUG
57-
std::cout << "get_output_key: " << key << std::endl;
58-
std::cout << "get_output msgid: " << msgid << std::endl;
56+
std::cout << "get_output_key: " << key << std::endl;
57+
std::cout << "get_output msgid: " << msgid << std::endl;
5958
#endif
6059

61-
int64_t* out_data = const_cast<int64_t*>(x.data<int64_t>());
62-
int ret = -1;
63-
if (!wait_flag) {
64-
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, IPC_NOWAIT);
65-
} else {
66-
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, 0);
67-
}
68-
if (ret == -1) {
69-
out_data[0] = -2;
70-
out_data[1] = 0;
71-
return;
72-
}
73-
int bsz = msg_rcv.mtext[1];
60+
int64_t* out_data = const_cast<int64_t*>(x.data<int64_t>());
61+
int ret = -1;
62+
if (!wait_flag) {
63+
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, IPC_NOWAIT);
64+
} else {
65+
ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, 0);
66+
}
67+
if (ret == -1) {
68+
out_data[0] = -2;
69+
out_data[1] = 0;
70+
return;
71+
}
72+
int bsz = msg_rcv.mtext[1];
7473

75-
for (int64_t i = 0; i < bsz + 2; i++) {
76-
out_data[i] = (int64_t)msg_rcv.mtext[i];
77-
}
74+
for (int64_t i = 0; i < bsz + 2; i++) {
75+
out_data[i] = (int64_t)msg_rcv.mtext[i];
76+
}
7877
#ifdef GET_OUTPUT_DEBUG
79-
std::cout << "get_output finished: " << msgid << std::endl;
78+
std::cout << "get_output finished: " << msgid << std::endl;
8079
#endif
8180

82-
return;
81+
return;
8382
}
8483

8584
void GetOutputStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
86-
GetOutput(x, rank_id, wait_flag, 1);
85+
GetOutput(x, rank_id, wait_flag, 1);
8786
}
8887

8988
void GetOutputDynamic(const paddle::Tensor& x,
9089
int64_t rank_id,
9190
bool wait_flag,
9291
int msg_queue_id) {
93-
GetOutput(x, rank_id, wait_flag, msg_queue_id);
92+
GetOutput(x, rank_id, wait_flag, msg_queue_id);
9493
}
9594

9695
PD_BUILD_STATIC_OP(get_output)

custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -989,8 +989,20 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
989989
paddle::DataType::FLOAT32,
990990
place);
991991

992-
auto m_indices =
993-
GetEmptyTensor({token_nums_feed_to_ffn}, paddle::DataType::INT32, place);
992+
paddle::Tensor m_indices;
993+
if (use_in_ep) {
994+
m_indices = GetEmptyTensor(
995+
{token_nums_feed_to_ffn}, paddle::DataType::INT32, place);
996+
} else {
997+
// Note(ZKK)
998+
// In TP, we must init m_indices with -1,
999+
// because we allocate too much space.
1000+
// token_rows * moe_topk + num_experts_per_rank * (128 - 1)
1001+
// Later will optimize this.
1002+
m_indices = paddle::full(
1003+
{token_nums_feed_to_ffn}, -1, paddle::DataType::INT32, place);
1004+
}
1005+
9941006
auto token_nums_per_expert_cumsum =
9951007
GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place);
9961008
auto token_nums_per_expert_padded_cumsum =

custom_ops/xpu_ops/build.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ elif [ -d "${LEGACY_PACKAGE_DIR}" ]; then
5454
CUSTOM_OP_DLL_PATH="${TMP_PACKAGE_DIR}/fastdeploy_ops_pd_.so"
5555
else
5656
echo -e "${RED}[Error]${NONE} Neither modern nor legacy directory for xpu ops found in ${OPS_TMP_DIR}"
57+
echo -e "${BLUE}[Info]${NONE} Maybe the compilation failed, please clean the build directory and try again."
58+
echo -e "${BLUE}[Info]${NONE} If the build still fails, please try to use a clean FastDeploy code and a clean environment to compile again."
59+
exit 1
5760
fi
5861

5962
mkdir -p ${TMP_PACKAGE_DIR}/libs

docs/best_practices/FAQ.md

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,100 @@ export ENABLE_V1_KVCACHE_SCHEDULER=1
3737
```
3838

3939
2. Check whether the number of KVCache blocks allocated by automatic profiling is as expected. If automatic profiling is affected by GPU memory fluctuations, fewer blocks than expected may be allocated; in that case, manually set the `num_gpu_blocks_override` parameter to increase the number of KVCache blocks.
40+
41+
## 3. How much concurrency can the service support?
42+
43+
1. It is recommended to configure the following environment variable when deploying the service:
44+
45+
```
46+
export ENABLE_V1_KVCACHE_SCHEDULER=1
47+
```
48+
49+
2. When starting the service, you need to configure `max-num-seqs`.
50+
This parameter specifies the maximum batch size during the Decode phase.
51+
If the concurrency exceeds this value, the extra requests will be queued.
52+
Under normal circumstances, you can set `max-num-seqs` to **128** to keep it relatively high; the actual concurrency is determined by the load-testing client.
53+
54+
3. `max-num-seqs` represents only the upper limit you configure.
55+
The **actual** concurrency the service can handle depends on the size of the **KVCache**.
56+
After the service starts, check `log/worker_process.log` and look for logs similar to:
57+
58+
```
59+
num_blocks_global: 17131
60+
```
61+
62+
This indicates that the current service has **17131 KVCache blocks**.
63+
With `block_size = 64` (default), the total number of tokens that can be cached is:
64+
65+
```
66+
17131 * 64 = 1,096,384 tokens
67+
```
68+
69+
If the average total number of tokens per request (input + output) is **20K**, then the service can actually support approximately:
70+
71+
```
72+
1,096,384 / 20,000 ≈ 54 concurrent requests
73+
```
74+
75+
## 4. Inference Request Stalls After Enabling logprobs
76+
77+
When **logprobs** is enabled, the inference output includes the log-probability of each token, which **significantly increases the size of each message body**. Under default settings, this may exceed the limits of the **System V Message Queue**, causing the inference request to **stall**.
78+
79+
The increase in message size differs between MTP and non-MTP modes. The calculations are shown below.
80+
81+
### Message Size Calculation
82+
83+
1. **Non-MTP + logprobs enabled**
84+
Size of a single message:
85+
86+
```
87+
((512 * (20 + 1)) + 2) * 8
88+
+ 512 * (20 + 1) * 4
89+
+ 512 * 8
90+
= 133136 bytes
91+
```
92+
93+
2. **MTP + logprobs enabled**
94+
Size of a single message:
95+
96+
```
97+
(512 * 6 * (20 + 1) + 512 + 3) * 8
98+
+ 512 * 6 * (20 + 1) * 4
99+
+ 512 * 6 * 8
100+
= 802840 bytes
101+
```
102+
103+
### Root Cause
104+
105+
Running `ipcs -l` typically shows the default System V message queue limits:
106+
107+
```
108+
------ Messages Limits --------
109+
max queues system wide = 32000
110+
max size of message (bytes) = 8192
111+
default max size of queue (bytes) = 16384
112+
```
113+
114+
If a single message **exceeds the `max size of message` limit (usually 8192 bytes)**, inter-process communication becomes blocked, causing the inference task to stall.
115+
116+
### Solution
117+
118+
**Increase the System V message queue size limits.**
119+
120+
Since message sizes can approach 800 KB in MTP mode, it is recommended to increase the **maximum message size to at least 1 MB (1048576 bytes)**.
121+
122+
Use the following commands on Linux:
123+
124+
```
125+
# Increase maximum size of a single message
126+
sysctl -w kernel.msgmax=1048576
127+
128+
# Increase maximum capacity of a message queue
129+
sysctl -w kernel.msgmnb=268435456
130+
```
131+
132+
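> **Note:** If running inside a Docker container, either start the container in privileged mode (`--privileged`) so that `sysctl -w` is permitted, or set these kernel parameters explicitly at container startup (for example, `docker run --sysctl kernel.msgmax=1048576 --sysctl kernel.msgmnb=268435456 ...`).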
133+
134+
### Deprecation Notice
135+
136+
This System V message queue–based communication mechanism will be **deprecated in future releases**. Subsequent versions will migrate to a more robust communication method that eliminates the limitations described above.

docs/features/structured_outputs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Structured Outputs refer to predefined format constraints that force large language models to generate content strictly following specified structures. This feature significantly improves output controllability and is suitable for scenarios requiring precise format outputs (such as API calls, data parsing, code generation, etc.), while supporting dynamic grammar extensions to balance flexibility and standardization.
88

99
FastDeploy supports using the [XGrammar](https://xgrammar.mlc.ai/docs/) backend to generate structured outputs.
10+
FastDeploy also supports using the [LLguidance](https://github.com/guidance-ai/llguidance) backend to generate structured outputs.
1011

1112
Supported output formats:
1213

docs/parameters.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ When using FastDeploy to deploy models (including offline inference and service
4444
| ```disable_sequence_parallel_moe``` | `bool` | Disable sequence parallel moe, default: False |
4545
| ```splitwise_role``` | `str` | Whether to enable splitwise inference, default value: mixed, supported parameters: ["mixed", "decode", "prefill"] |
4646
| ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None |
47-
| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `off`, default: `off` |
47+
| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `guidance`, `off`, default: `off` |
4848
| ```guided_decoding_disable_any_whitespace``` | `bool` | Whether to disable whitespace generation during guided decoding, default: False |
4949
| ```speculative_config``` | `dict[str]` | Speculative decoding configuration, only supports standard format JSON string, default: None |
5050
| ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |

docs/zh/best_practices/FAQ.md

Lines changed: 88 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,95 @@ export ENABLE_V1_KVCACHE_SCHEDULER=1
3939
2. 检查自动profile分配的KVCache block是否符合预期,如果自动profile中受到显存波动影响可能导致分配偏少,可以通过手工设置`num_gpu_blocks_override`参数扩大KVCache block。
4040

4141
## 3.服务可以支持多大并发?
42-
1. 服务部署时推荐配置环境变量
42+
43+
1. 服务部署时推荐配置以下环境变量
44+
45+
```
46+
export ENABLE_V1_KVCACHE_SCHEDULER=1
47+
```
48+
49+
2. 服务启动时需要配置 `max-num-seqs`
50+
该参数表示 Decode 阶段的**最大 Batch 数**,当并发超过该值时,多余的请求会进入排队等待处理。
51+
一般情况下,你可以将 `max-num-seqs` 配置为 **128**,保持在较高范围;实际并发能力由压测客户端决定。
52+
53+
3. `max-num-seqs` 仅表示**配置的上限**,但服务真正能支持的并发量取决于 **KVCache 的总大小**
54+
服务启动后,在 `log/worker_process.log` 中会看到类似:
55+
56+
```
57+
num_blocks_global: 17131
58+
```
59+
60+
这表示当前服务的 KVCache Block 数量为 **17131**,若 `block_size = 64`(默认),则可缓存 Token 总量为:
61+
62+
```
63+
17131 * 64 = 1,096,384 tokens
64+
```
65+
66+
如果你的请求平均(输入 + 输出)为 **20K tokens**,那么服务实际能支持的并发大约为:
67+
68+
```
69+
1,096,384 / 20,000 ≈ 54
70+
```
71+
72+
## 4. 启用 logprobs 后推理请求卡住
73+
74+
启用 **logprobs** 后,推理结果会附带每个 token 的logprobs信息,使**单条消息体显著变大**。在默认配置下,这可能触发 **System V Message Queue** 的消息大小限制,从而导致推理任务token输出**卡住**
75+
76+
不同模式下(MTP / 非 MTP)logprobs 会导致消息体膨胀的规模不同,具体计算如下。
77+
78+
### 消息体大小计算
79+
80+
1. **非 MTP 模式 + logprobs**
81+
单条消息体大小:
82+
83+
```
84+
((512 * (20 + 1)) + 2) * 8
85+
+ 512 * (20 + 1) * 4
86+
+ 512 * 8
87+
= 133136 bytes
88+
```
89+
90+
2. **MTP 模式 + logprobs**
91+
单条消息体大小:
92+
93+
```
94+
(512 * 6 * (20 + 1) + 512 + 3) * 8
95+
+ 512 * 6 * (20 + 1) * 4
96+
+ 512 * 6 * 8
97+
= 802840 bytes
98+
```
99+
100+
### 问题原因
101+
102+
通过 `ipcs -l` 查看系统默认的 System V 消息队列限制,常见设置如下:
103+
43104
```
44-
export ENABLE_V1_KVCACHE_SCHEDULER=1
105+
------ Messages Limits --------
106+
max queues system wide = 32000
107+
max size of message (bytes) = 8192
108+
default max size of queue (bytes) = 16384
45109
```
46110

47-
2. 服务在启动时需要配置max-num-seqs,此参数用于表示Decode阶段的最大Batch数,如果并发超过此值,则超出的请求会排队等待处理, 常规情况下你可以将max-num-seqs配置为128,保持在较高的范围,实际并发由发压客户端来决定。
111+
当单条消息体大小**超过 max size of message(默认 8192 bytes)** 时,进程间通信会被阻塞,最终表现为推理请求卡住。
112+
113+
### 解决方案
114+
115+
**调大 System V Message Queue 的消息大小限制。**
116+
117+
由于 MTP 下的消息体可接近 800 KB,建议将**单条消息大小限制提升至 1MB(1048576 bytes)**
118+
119+
Linux 系统可通过以下命令调整:
120+
121+
```
122+
# 提高单条消息的最大允许大小
123+
sysctl -w kernel.msgmax=1048576
124+
125+
# 提高单个消息队列的最大容量
126+
sysctl -w kernel.msgmnb=268435456
127+
```
128+
129+
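> **注意**: 若在 Docker 容器中运行,需要启用特权模式(`--privileged`)以允许执行 `sysctl -w`,或在容器启动参数中显式设置相关内核参数(例如 `docker run --sysctl kernel.msgmax=1048576 --sysctl kernel.msgmnb=268435456 ...`)。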
130+
131+
### 废弃说明
48132

49-
3. max-num-seqs仅表示设定的上限,但实际上服务能并发处理的上限取决于KVCache的大小,在启动服务后,查看log/worker_process.log会看到类似num_blocks_global: 17131的日志,这表明当前服务的KVCache Block数量为17131, 17131 \* block_size(默认64)即知道总共可缓存的Token数量,例如此处为17131 \* 64 = 1096384。如果你的请求数据平均输入和输出Token之和为20K,那么服务实际可以处理的并发大概为1096384/20k=53
133+
当前基于 System V Message Queue 的通信机制将在后续版本中被废弃。未来将迁移到更稳定、更高效的通信方式,以彻底解决上述限制问题。

docs/zh/features/structured_outputs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Structured Outputs 是指通过预定义格式约束,使大模型生成内容严格遵循指定结构。该功能可显著提升生成结果的可控性,适用于需要精确格式输出的场景(如API调用、数据解析、代码生成等),同时支持动态语法扩展,平衡灵活性与规范性。
88

99
FastDeploy 支持使用 [XGrammar](https://xgrammar.mlc.ai/docs/) 后端生成结构化输出。
10+
FastDeploy 也支持使用 [LLguidance](https://github.com/guidance-ai/llguidance) 后端生成结构化输出。
1011

1112
支持输出格式
1213

docs/zh/parameters.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
| ```disable_sequence_parallel_moe``` | `bool` | 禁止在TP+EP中使用序列并行优化, default: False |
4343
| ```splitwise_role``` | `str` | 是否开启splitwise推理,默认值mixed, 支持参数为["mixed", "decode", "prefill"] |
4444
| ```innode_prefill_ports``` | `str` | prefill 实例内部引擎启动端口 (仅单机PD分离需要),默认值None |
45-
| ```guided_decoding_backend``` | `str` | 指定要使用的guided decoding后端,支持 `auto`、`xgrammar`、`off`, 默认为 `off` |
45+
| ```guided_decoding_backend``` | `str` | 指定要使用的guided decoding后端,支持 `auto`、`xgrammar`、`guidance`、`off`, 默认为 `off` |
4646
| ```guided_decoding_disable_any_whitespace``` | `bool` | guided decoding期间是否禁止生成空格,默认False |
4747
| ```speculative_config``` | `dict[str]` | 投机解码配置,仅支持标准格式json字符串,默认为None |
4848
| ```dynamic_load_weight``` | `int` | 是否动态加载权重,默认0 |

0 commit comments

Comments
 (0)