PaddlePaddle
diff --git a/‎build.sh‎
Lines changed: 3 additions & 0 deletions b/‎build.sh‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎custom_ops/gpu_ops/get_output.cc‎
Lines changed: 39 additions & 40 deletions b/‎custom_ops/gpu_ops/get_output.cc‎
Lines changed: 39 additions & 40 deletions
diff --git a/‎custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu‎
Lines changed: 14 additions & 2 deletions b/‎custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎custom_ops/xpu_ops/build.sh‎
Lines changed: 3 additions & 0 deletions b/‎custom_ops/xpu_ops/build.sh‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/best_practices/FAQ.md‎
Lines changed: 97 additions & 0 deletions b/‎docs/best_practices/FAQ.md‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎docs/features/structured_outputs.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/features/structured_outputs.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/parameters.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/parameters.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/zh/best_practices/FAQ.md‎
Lines changed: 88 additions & 4 deletions b/‎docs/zh/best_practices/FAQ.md‎
Lines changed: 88 additions & 4 deletions
diff --git a/‎docs/zh/features/structured_outputs.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/zh/features/structured_outputs.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/zh/parameters.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/zh/parameters.md‎
Lines changed: 1 addition & 1 deletion
@@ -96,6 +96,9 @@ function copy_ops(){
         TMP_PACKAGE_DIR="${LEGACY_PACKAGE_DIR}"
     else
         echo -e "${RED}[Error]${NONE} Neither modern nor legacy directory for gpu ops found in ${OPS_TMP_DIR}"
+        echo -e "${BLUE}[Info]${NONE} Maybe the compilation failed, please clean the build directory (currently ${BUILD_DIR}) and egg directory (currently ${EGG_DIR}) and try again."
+        echo -e "${BLUE}[Info]${NONE} If the build still fails, please try to use a clean FastDeploy code and a clean environment to compile again."
+        exit 1
     fi
 
     # Handle CPU ops directory compatibility between modern and legacy naming
 
@@ -26,71 +26,70 @@
 #define MAX_BSZ 512
 // #define GET_OUTPUT_DEBUG
 struct msgdata {
-    long mtype;
-    int mtext[MAX_BSZ + 2];  // stop_flag, bsz, tokens
+  long mtype;
+  int mtext[MAX_BSZ + 2];  // stop_flag, bsz, tokens
 };
 
 void GetOutput(const paddle::Tensor& x,
                int64_t rank_id,
                bool wait_flag,
                int msg_queue_id) {
-    if (rank_id > 0) {
-        return;
-    }
-    static struct msgdata msg_rcv;
-    if (const char* inference_msg_queue_id_env_p =
-            std::getenv("INFERENCE_MSG_QUEUE_ID")) {
-        std::string inference_msg_queue_id_env_str(
-            inference_msg_queue_id_env_p);
-        int inference_msg_queue_id_from_env =
-            std::stoi(inference_msg_queue_id_env_str);
+  if (rank_id > 0) {
+    return;
+  }
+  static struct msgdata msg_rcv;
+  if (const char* inference_msg_queue_id_env_p =
+          std::getenv("INFERENCE_MSG_QUEUE_ID")) {
+    std::string inference_msg_queue_id_env_str(inference_msg_queue_id_env_p);
+    int inference_msg_queue_id_from_env =
+        std::stoi(inference_msg_queue_id_env_str);
 #ifdef GET_OUTPUT_DEBUG
-        std::cout << "Your INFERENCE_MSG_QUEUE_ID is: "
-                  << inference_msg_queue_id_from_env << std::endl;
+    std::cout << "Your INFERENCE_MSG_QUEUE_ID is: "
+              << inference_msg_queue_id_from_env << std::endl;
 #endif
-        msg_queue_id = inference_msg_queue_id_from_env;
-    }
-    static key_t key = ftok("/dev/shm", msg_queue_id);
-    static int msgid = msgget(key, IPC_CREAT | 0666);
+    msg_queue_id = inference_msg_queue_id_from_env;
+  }
+  static key_t key = ftok("/dev/shm", msg_queue_id);
+  static int msgid = msgget(key, IPC_CREAT | 0666);
 
 #ifdef GET_OUTPUT_DEBUG
-    std::cout << "get_output_key: " << key << std::endl;
-    std::cout << "get_output msgid: " << msgid << std::endl;
+  std::cout << "get_output_key: " << key << std::endl;
+  std::cout << "get_output msgid: " << msgid << std::endl;
 #endif
 
-    int64_t* out_data = const_cast<int64_t*>(x.data<int64_t>());
-    int ret = -1;
-    if (!wait_flag) {
-        ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, IPC_NOWAIT);
-    } else {
-        ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, 0);
-    }
-    if (ret == -1) {
-        out_data[0] = -2;
-        out_data[1] = 0;
-        return;
-    }
-    int bsz = msg_rcv.mtext[1];
+  int64_t* out_data = const_cast<int64_t*>(x.data<int64_t>());
+  int ret = -1;
+  if (!wait_flag) {
+    ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, IPC_NOWAIT);
+  } else {
+    ret = msgrcv(msgid, &msg_rcv, (MAX_BSZ + 2) * 4, 0, 0);
+  }
+  if (ret == -1) {
+    out_data[0] = -2;
+    out_data[1] = 0;
+    return;
+  }
+  int bsz = msg_rcv.mtext[1];
 
-    for (int64_t i = 0; i < bsz + 2; i++) {
-        out_data[i] = (int64_t)msg_rcv.mtext[i];
-    }
+  for (int64_t i = 0; i < bsz + 2; i++) {
+    out_data[i] = (int64_t)msg_rcv.mtext[i];
+  }
 #ifdef GET_OUTPUT_DEBUG
-    std::cout << "get_output finished: " << msgid << std::endl;
+  std::cout << "get_output finished: " << msgid << std::endl;
 #endif
 
-    return;
+  return;
 }
 
 void GetOutputStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
-    GetOutput(x, rank_id, wait_flag, 1);
+  GetOutput(x, rank_id, wait_flag, 1);
 }
 
 void GetOutputDynamic(const paddle::Tensor& x,
                       int64_t rank_id,
                       bool wait_flag,
                       int msg_queue_id) {
-    GetOutput(x, rank_id, wait_flag, msg_queue_id);
+  GetOutput(x, rank_id, wait_flag, msg_queue_id);
 }
 
 PD_BUILD_STATIC_OP(get_output)
 
@@ -989,8 +989,20 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
                      paddle::DataType::FLOAT32,
                      place);
 
-  auto m_indices =
-      GetEmptyTensor({token_nums_feed_to_ffn}, paddle::DataType::INT32, place);
+  paddle::Tensor m_indices;
+  if (use_in_ep) {
+    m_indices = GetEmptyTensor(
+        {token_nums_feed_to_ffn}, paddle::DataType::INT32, place);
+  } else {
+    // Note(ZKK)
+    // In TP, m_indices must be initialized to -1 because the buffer is
+    // over-allocated; its size is
+    //   token_rows * moe_topk + num_experts_per_rank * (128 - 1)
+    // TODO: optimize this allocation later.
+    m_indices = paddle::full(
+        {token_nums_feed_to_ffn}, -1, paddle::DataType::INT32, place);
+  }
+
   auto token_nums_per_expert_cumsum =
       GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place);
   auto token_nums_per_expert_padded_cumsum =
 
@@ -54,6 +54,9 @@ elif [ -d "${LEGACY_PACKAGE_DIR}" ]; then
     CUSTOM_OP_DLL_PATH="${TMP_PACKAGE_DIR}/fastdeploy_ops_pd_.so"
 else
     echo -e "${RED}[Error]${NONE} Neither modern nor legacy directory for xpu ops found in ${OPS_TMP_DIR}"
+    echo -e "${BLUE}[Info]${NONE} Maybe the compilation failed, please clean the build directory and try again."
+    echo -e "${BLUE}[Info]${NONE} If the build still fails, please try to use a clean FastDeploy code and a clean environment to compile again."
+    exit 1
 fi
 
 mkdir -p ${TMP_PACKAGE_DIR}/libs
 
@@ -37,3 +37,100 @@ export ENABLE_V1_KVCACHE_SCHEDULER=1
 ```
 
 2. Check whether the number of KVCache blocks allocated by automatic profiling is as expected. Automatic profiling can be affected by GPU memory fluctuations and may allocate fewer blocks than expected; in that case, you can manually set the `num_gpu_blocks_override` parameter to increase the number of KVCache blocks.
+
+## 3. How much concurrency can the service support?
+
+1. It is recommended to configure the following environment variable when deploying the service:
+
+   ```
+   export ENABLE_V1_KVCACHE_SCHEDULER=1
+   ```
+
+2. When starting the service, you need to configure `max-num-seqs`.
+   This parameter specifies the maximum batch size during the Decode phase.
+   If the concurrency exceeds this value, the extra requests will be queued.
+   Under normal circumstances, you can set `max-num-seqs` to **128** to keep it relatively high; the actual concurrency is determined by the load-testing client.
+
+3. `max-num-seqs` represents only the upper limit you configure.
+   The **actual** concurrency the service can handle depends on the size of the **KVCache**.
+   After the service starts, check `log/worker_process.log` and look for logs similar to:
+
+   ```
+   num_blocks_global: 17131
+   ```
+
+   This indicates that the current service has **17131 KVCache blocks**.
+   With `block_size = 64` (default), the total number of tokens that can be cached is:
+
+   ```
+   17131 * 64 = 1,096,384 tokens
+   ```
+
+   If the average total number of tokens per request (input + output) is **20K**, then the service can actually support approximately:
+
+   ```
+   1,096,384 / 20,000 ≈ 54 concurrent requests
+   ```
+
+## 4. Inference Request Stalls After Enabling logprobs
+
+When **logprobs** is enabled, the inference output includes the log-probability of each token, which **significantly increases the size of each message body**. Under default settings, this may exceed the limits of the **System V Message Queue**, causing the inference request to **stall**.
+
+The increase in message size differs between MTP and non-MTP modes. The calculations are shown below.
+
+### Message Size Calculation
+
+1. **Non-MTP + logprobs enabled**
+   Size of a single message:
+
+   ```
+   ((512 * (20 + 1)) + 2) * 8
+   + 512 * (20 + 1) * 4
+   + 512 * 8
+   = 133136 bytes
+   ```
+
+2. **MTP + logprobs enabled**
+   Size of a single message:
+
+   ```
+   (512 * 6 * (20 + 1) + 512 + 3) * 8
+   + 512 * 6 * (20 + 1) * 4
+   + 512 * 6 * 8
+   = 802840 bytes
+   ```
+
+### Root Cause
+
+Running `ipcs -l` typically shows the default System V message queue limits:
+
+```
+------ Messages Limits --------
+max queues system wide = 32000
+max size of message (bytes) = 8192
+default max size of queue (bytes) = 16384
+```
+
+If a single message **exceeds the `max size of message` limit (usually 8192 bytes)**, inter-process communication becomes blocked, causing the inference task to stall.
+
+### Solution
+
+**Increase the System V message queue size limits.**
+
+Since message sizes can approach 800 KB in MTP mode, it is recommended to increase the **maximum message size to at least 1 MB (1048576 bytes)**.
+
+Use the following commands on Linux:
+
+```
+# Increase maximum size of a single message
+sysctl -w kernel.msgmax=1048576
+
+# Increase maximum capacity of a message queue
+sysctl -w kernel.msgmnb=268435456
+```
+
+> **Note:** If running inside a Docker container, privileged mode (`--privileged`) is required, or you must explicitly set these kernel parameters via container startup options.
+
+### Deprecation Notice
+
+This System V message queue–based communication mechanism will be **deprecated in future releases**. Subsequent versions will migrate to a more robust communication method that eliminates the limitations described above.
@@ -7,6 +7,7 @@
 Structured Outputs refer to predefined format constraints that force large language models to generate content strictly following specified structures. This feature significantly improves output controllability and is suitable for scenarios requiring precise format outputs (such as API calls, data parsing, code generation, etc.), while supporting dynamic grammar extensions to balance flexibility and standardization.
 
 FastDeploy supports using the [XGrammar](https://xgrammar.mlc.ai/docs/) backend to generate structured outputs.
+FastDeploy also supports using the [LLguidance](https://github.com/guidance-ai/llguidance) backend to generate structured outputs.
 
 Supported output formats:
 
 
@@ -44,7 +44,7 @@ When using FastDeploy to deploy models (including offline inference and service
 | ```disable_sequence_parallel_moe``` | `bool` | Disable sequence parallel moe, default: False |
 | ```splitwise_role``` | `str` | Whether to enable splitwise inference, default value: mixed, supported parameters: ["mixed", "decode", "prefill"] |
 | ```innode_prefill_ports``` | `str` | Internal engine startup ports for prefill instances (only required for single-machine PD separation), default: None |
-| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `off`, default: `off` |
+| ```guided_decoding_backend``` | `str` | Specify the guided decoding backend to use, supports `auto`, `xgrammar`, `guidance`, `off`, default: `off` |
 | ```guided_decoding_disable_any_whitespace``` | `bool` | Whether to disable whitespace generation during guided decoding, default: False |
 | ```speculative_config``` | `dict[str]` | Speculative decoding configuration, only supports standard format JSON string, default: None |
 | ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
 
@@ -39,11 +39,95 @@ export ENABLE_V1_KVCACHE_SCHEDULER=1
 2. 检查自动profile分配的KVCache block是否符合预期，如果自动profile中受到显存波动影响可能导致分配偏少，可以通过手工设置`num_gpu_blocks_override`参数扩大KVCache block。
 
 ## 3.服务可以支持多大并发？
-1. 服务部署时推荐配置环境变量
+
+1. 服务部署时推荐配置以下环境变量
+
+   ```
+   export ENABLE_V1_KVCACHE_SCHEDULER=1
+   ```
+
+2. 服务启动时需要配置 `max-num-seqs`
+   该参数表示 Decode 阶段的**最大 Batch 数**，当并发超过该值时，多余的请求会进入排队等待处理。
+   一般情况下，你可以将 `max-num-seqs` 配置为 **128**，保持在较高范围；实际并发能力由压测客户端决定。
+
+3. `max-num-seqs` 仅表示**配置的上限**，但服务真正能支持的并发量取决于 **KVCache 的总大小**
+   服务启动后，在 `log/worker_process.log` 中会看到类似：
+
+   ```
+   num_blocks_global: 17131
+   ```
+
+   这表示当前服务的 KVCache Block 数量为 **17131**，若 `block_size = 64`（默认），则可缓存 Token 总量为：
+
+   ```
+   17131 * 64 = 1,096,384 tokens
+   ```
+
+   如果你的请求平均（输入 + 输出）为 **20K tokens**，那么服务实际能支持的并发大约为：
+
+   ```
+   1,096,384 / 20,000 ≈ 54
+   ```
+
+## 4. 启用 logprobs 后推理请求卡住
+
+启用 **logprobs** 后，推理结果会附带每个 token 的logprobs信息，使**单条消息体显著变大**。在默认配置下，这可能触发 **System V Message Queue** 的消息大小限制，从而导致推理任务token输出**卡住**。
+
+不同模式下（MTP / 非 MTP）logprobs 会导致消息体膨胀的规模不同，具体计算如下。
+
+### 消息体大小计算
+
+1. **非 MTP 模式 + logprobs**
+   单条消息体大小：
+
+   ```
+   ((512 * (20 + 1)) + 2) * 8
+   + 512 * (20 + 1) * 4
+   + 512 * 8
+   = 133136 bytes
+   ```
+
+2. **MTP 模式 + logprobs**
+   单条消息体大小：
+
+   ```
+   (512 * 6 * (20 + 1) + 512 + 3) * 8
+   + 512 * 6 * (20 + 1) * 4
+   + 512 * 6 * 8
+   = 802840 bytes
+   ```
+
+### 问题原因
+
+通过 `ipcs -l` 查看系统默认的 System V 消息队列限制，常见设置如下：
+
 ```
-export ENABLE_V1_KVCACHE_SCHEDULER=1
+------ Messages Limits --------
+max queues system wide = 32000
+max size of message (bytes) = 8192
+default max size of queue (bytes) = 16384
 ```
 
-2. 服务在启动时需要配置max-num-seqs，此参数用于表示Decode阶段的最大Batch数，如果并发超过此值，则超出的请求会排队等待处理, 常规情况下你可以将max-num-seqs配置为128，保持在较高的范围，实际并发由发压客户端来决定。
+当单条消息体大小**超过 max size of message（默认 8192 bytes）** 时，进程间通信会被阻塞，最终表现为推理请求卡住。
+
+### 解决方案
+
+**调大 System V Message Queue 的消息大小限制。**
+
+由于 MTP 下的消息体可接近 800 KB，建议将**单条消息大小限制提升至 1MB（1048576 bytes）**。
+
+Linux 系统可通过以下命令调整：
+
+```
+# 提高单条消息的最大允许大小
+sysctl -w kernel.msgmax=1048576
+
+# 提高单个消息队列的最大容量
+sysctl -w kernel.msgmnb=268435456
+```
+
+> **注意**: 若在 Docker 容器中运行，需要启用特权模式（`--privileged`），或在启动参数中显式设置相关内核参数。
+
+### 废弃说明
 
-3. max-num-seqs仅表示设定的上限，但实际上服务能并发处理的上限取决于KVCache的大小，在启动服务后，查看log/worker_process.log会看到类似num_blocks_global: 17131的日志，这表明当前服务的KVCache Block数量为17131, 17131block_size(默认64）即知道总共可缓存的Token数量，例如此处为1713164=1096384。如果你的请求数据平均输入和输出Token之和为20K，那么服务实际可以处理的并发大概为1096384/20k=53
+当前基于 System V Message Queue 的通信机制将在后续版本中被废弃。未来将迁移到更稳定、更高效的通信方式，以彻底解决上述限制问题。
@@ -7,6 +7,7 @@
 Structured Outputs 是指通过预定义格式约束，使大模型生成内容严格遵循指定结构。该功能可显著提升生成结果的可控性，适用于需要精确格式输出的场景（如API调用、数据解析、代码生成等），同时支持动态语法扩展，平衡灵活性与规范性。
 
 FastDeploy 支持使用 [XGrammar](https://xgrammar.mlc.ai/docs/) 后端生成结构化输出。
+FastDeploy 也支持使用 [LLguidance](https://github.com/guidance-ai/llguidance) 后端生成结构化输出。
 
 支持输出格式
 
 
@@ -42,7 +42,7 @@
 | ```disable_sequence_parallel_moe``` | `bool` | 禁止在TP+EP中使用序列并行优化, default: False |
 | ```splitwise_role```               | `str`       | 是否开启splitwise推理，默认值mixed， 支持参数为["mixed", "decode", "prefill"] |
 | ```innode_prefill_ports```         | `str`       | prefill 实例内部引擎启动端口 （仅单机PD分离需要），默认值None |
-| ```guided_decoding_backend```      | `str`       | 指定要使用的guided decoding后端，支持 `auto`、`xgrammar`、`off`, 默认为 `off` |
+| ```guided_decoding_backend```      | `str`       | 指定要使用的guided decoding后端，支持 `auto`、`xgrammar`、`guidance`、`off`, 默认为 `off` |
 | ```guided_decoding_disable_any_whitespace``` | `bool`   | guided decoding期间是否禁止生成空格，默认False |
 | ```speculative_config```           | `dict[str]` | 投机解码配置，仅支持标准格式json字符串，默认为None |
 | ```dynamic_load_weight```          | `int`       | 是否动态加载权重，默认0 |