PaddlePaddle · RichardWooSJTU · Aug 4, 2025 · Aug 1, 2025 · Aug 1, 2025 · Aug 4, 2025
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
@@ -525,6 +525,12 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod
             num_tokens // batch_size,
             self.parallel_config.max_model_len - max_dec_len,
         )
+
+        # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer is not large enough, which causes NaN values to appear in the result.
+        # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
+        if self.fd_config.parallel_config.enable_expert_parallel:
+            full_length = min(full_length, 32)
+
         input_length = int(full_length * self.cache_config.kv_cache_ratio)
         block_num = (
             input_length + self.cache_config.block_size - 1