67 changes: 67 additions & 0 deletions custom_ops/gpu_ops/cpp_extensions.cc
@@ -12,6 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdexcept>
#include <string>

#include "paddle/extension.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
@@ -49,6 +55,51 @@ void cuda_host_free(uintptr_t ptr) {
check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
}

// Create a POSIX shared memory region, map it, and register it with CUDA
// as pinned (page-locked) host memory; the region can be shared between
// processes via its name
uintptr_t create_pinned_shm(const char* shm_name, size_t byte_size) {
int fd = shm_open(shm_name, O_CREAT | O_RDWR, 0666);
if (fd < 0) throw std::runtime_error("shm_open failed");

if (ftruncate(fd, byte_size) != 0)
throw std::runtime_error("ftruncate failed");

void* addr =
mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");

check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));

close(fd);
return reinterpret_cast<uintptr_t>(addr);
}

// Map an existing shared memory region created by another process and
// register it with CUDA as pinned host memory
uintptr_t open_pinned_shm(const char* shm_name, size_t byte_size) {
int fd = shm_open(shm_name, O_RDWR, 0666);
if (fd < 0) throw std::runtime_error("shm_open failed");

void* addr =
mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");

check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));

close(fd);
return reinterpret_cast<uintptr_t>(addr);
}

// Unregister, unmap, and unlink a pinned shared memory region
void free_pinned_shm(const char* shm_name,
                     uintptr_t addr_uint,
                     size_t byte_size) {
void* addr = reinterpret_cast<void*>(addr_uint);

check_cuda_error(cudaHostUnregister(addr));

if (munmap(addr, byte_size) != 0) throw std::runtime_error("munmap failed");

if (shm_unlink(shm_name) != 0) throw std::runtime_error("shm_unlink failed");
}

std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor& qkv,
const paddle::Tensor& key_cache,
@@ -1131,6 +1182,22 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("flags") = cudaHostAllocDefault);
m.def(
"cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
m.def("create_pinned_shm",
&create_pinned_shm,
"Allocate pinned memory for supporting inter process communication",
py::arg("name"),
py::arg("byte_size"));
m.def("open_pinned_shm",
&open_pinned_shm,
"Open pinned memory which has been allocated by another process",
py::arg("name"),
py::arg("byte_size"));
m.def("free_pinned_shm",
&free_pinned_shm,
"Free pinned memory which supports inter process communication",
py::arg("name"),
py::arg("addr_uint"),
py::arg("byte_size"));
py::register_exception<CudaError>(m, "CudaError");
/**
* append_attention.cu
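A minimal usage sketch of the new pinned-shm bindings, assuming the compiled extension is importable as `fastdeploy_ops` (the actual import path depends on how FastDeploy packages its GPU ops); the shm name, buffer size, and the `ctypes` access pattern are illustrative only:

```python
# Hypothetical sketch, not part of this PR: exercises create/open/free_pinned_shm.
import ctypes

from fastdeploy_ops import create_pinned_shm, open_pinned_shm, free_pinned_shm

SHM_NAME = "demo_pinned_shm"  # illustrative name; both processes must agree on it
BYTE_SIZE = 1 << 20           # 1 MiB

# Owner process: create the region, register it with CUDA, and write a marker byte.
addr = create_pinned_shm(SHM_NAME, BYTE_SIZE)
buf = (ctypes.c_ubyte * BYTE_SIZE).from_address(addr)
buf[0] = 42

# Peer process (normally a separate interpreter): map the same region by name.
peer_addr = open_pinned_shm(SHM_NAME, BYTE_SIZE)
peer_buf = (ctypes.c_ubyte * BYTE_SIZE).from_address(peer_addr)
assert peer_buf[0] == 42

# Owner process: unregister, unmap, and unlink the region when done.
free_pinned_shm(SHM_NAME, addr, BYTE_SIZE)
```

Both `create_pinned_shm` and `open_pinned_shm` register their own mapping with `cudaHostRegisterPortable`, so each process that maps the region gets page-locked host memory that CUDA can use directly for host/device copies.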
36 changes: 36 additions & 0 deletions examples/splitwise/README.md
@@ -0,0 +1,36 @@
# Run the Examples on NVIDIA CUDA GPU

## Prepare the Environment
Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) for instructions on pulling the Docker image, for example:
```bash
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0
```

[NVIDIA MLNX_OFED](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/) and [Redis](https://redis.io/) are pre-installed in the Docker container.

## Build and Install FastDeploy

```bash
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy

export ENABLE_FD_RDMA=1

# Argument 1: Whether to build wheel package (1 for yes, 0 for compile only)
# Argument 2: Python interpreter path
# Argument 3: Whether to compile CPU inference operators
# Argument 4: Target GPU architectures
bash build.sh 1 python false [80,90]
```

## Run the Examples

Run the shell scripts in this directory, e.g. ```bash start_v0_tp1.sh``` or ```bash start_v1_tp1.sh```.

Note that there are two methods for splitwise deployment:
* v0: uses splitwise_scheduler or dp_scheduler; requests are scheduled in the engine.
* v1: uses the router; requests are scheduled in the router.

# Run the Examples on Kunlunxin XPU

Coming soon...
34 changes: 14 additions & 20 deletions examples/splitwise/start_mixed.sh
@@ -3,41 +3,35 @@ set -e

# Test mixed server + router

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"

export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

S1_PORT=52400
S2_PORT=52500
ROUTER_PORT=52600

ports=(
$S1_PORT $((S1_PORT + 1)) $((S1_PORT + 2)) $((S1_PORT + 3))
$S2_PORT $((S2_PORT + 1)) $((S2_PORT + 2)) $((S2_PORT + 3))
$ROUTER_PORT
)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.router.launch \
--port ${ROUTER_PORT} \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

# start modelserver 0
export CUDA_VISIBLE_DEVICES=0
@@ -53,7 +47,6 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

wait_for_health ${S1_PORT}

@@ -76,12 +69,13 @@ wait_for_health ${S2_PORT}

# send request
sleep 10 # make sure server is registered to router
echo "send request..."
curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 20,
"stream": true
"stream": false
}'
32 changes: 15 additions & 17 deletions examples/splitwise/start_v0_tp1.sh
@@ -6,22 +6,8 @@ set -e
# v0: using splitwise_scheduler or dp_scheduler
# v1: using local_scheduler + router

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"

export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=1
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

P_PORT=52400
D_PORT=52500
REDIS_PORT=56388
REDIS_PORT="${REDIS_PORT:-56388}"

ports=(
$P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
$D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
$REDIS_PORT
)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start redis
if ! redis-cli -p ${REDIS_PORT} ping &>/dev/null; then
@@ -104,12 +101,13 @@ wait_for_health ${D_PORT}

# send request
sleep 10 # make sure server is registered to router
echo "send request..."
curl -X POST "http://0.0.0.0:${D_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 20,
"stream": true
"stream": false
}'
111 changes: 0 additions & 111 deletions examples/splitwise/start_v0_tp2.sh

This file was deleted.
