67 changes: 67 additions & 0 deletions custom_ops/gpu_ops/cpp_extensions.cc
@@ -12,6 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdexcept>
#include <string>

#include "paddle/extension.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
@@ -49,6 +55,51 @@ void cuda_host_free(uintptr_t ptr) {
check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
}

// Create a POSIX shared memory region, map it, and register it with CUDA
// as pinned (page-locked) host memory; the region can be shared between
// processes via its name
uintptr_t create_pinned_shm(const char* shm_name, size_t byte_size) {
int fd = shm_open(shm_name, O_CREAT | O_RDWR, 0666);
if (fd < 0) throw std::runtime_error("shm_open failed");

if (ftruncate(fd, byte_size) != 0)
throw std::runtime_error("ftruncate failed");

void* addr =
mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");

check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));

close(fd);
return reinterpret_cast<uintptr_t>(addr);
}

// Map an existing shared memory region created by another process and
// register it with CUDA as pinned host memory
uintptr_t open_pinned_shm(const char* shm_name, size_t byte_size) {
int fd = shm_open(shm_name, O_RDWR, 0666);
if (fd < 0) throw std::runtime_error("shm_open failed");

void* addr =
mmap(nullptr, byte_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) throw std::runtime_error("mmap failed");

check_cuda_error(cudaHostRegister(addr, byte_size, cudaHostRegisterPortable));

close(fd);
return reinterpret_cast<uintptr_t>(addr);
}

// Unregister, unmap, and unlink a pinned shared memory region
void free_pinned_shm(const char* shm_name,
                     uintptr_t addr_uint,
                     size_t byte_size) {
void* addr = reinterpret_cast<void*>(addr_uint);

check_cuda_error(cudaHostUnregister(addr));

if (munmap(addr, byte_size) != 0) throw std::runtime_error("munmap failed");

if (shm_unlink(shm_name) != 0) throw std::runtime_error("shm_unlink failed");
}

std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor& qkv,
const paddle::Tensor& key_cache,
@@ -1131,6 +1182,22 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("flags") = cudaHostAllocDefault);
m.def(
"cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
m.def("create_pinned_shm",
&create_pinned_shm,
"Allocate pinned memory for supporting inter process communication",
py::arg("name"),
py::arg("byte_size"));
m.def("open_pinned_shm",
&open_pinned_shm,
"Open pinned memory which has been allocated by another process",
py::arg("name"),
py::arg("byte_size"));
m.def("free_pinned_shm",
&free_pinned_shm,
"Free pinned memory which supports inter process communication",
py::arg("name"),
py::arg("addr_uint"),
py::arg("byte_size"));
py::register_exception<CudaError>(m, "CudaError");
/**
* append_attention.cu
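A minimal usage sketch of the new pinned-shm bindings, assuming the compiled extension is importable as `fastdeploy_ops` (the actual import path depends on how FastDeploy packages its GPU ops); the shm name, buffer size, and the `ctypes` access pattern are illustrative only:

```python
# Hypothetical sketch, not part of this PR: exercises create/open/free_pinned_shm.
import ctypes

from fastdeploy_ops import create_pinned_shm, open_pinned_shm, free_pinned_shm

SHM_NAME = "demo_pinned_shm"  # illustrative name; both processes must agree on it
BYTE_SIZE = 1 << 20           # 1 MiB

# Owner process: create the region, register it with CUDA, and write a marker byte.
addr = create_pinned_shm(SHM_NAME, BYTE_SIZE)
buf = (ctypes.c_ubyte * BYTE_SIZE).from_address(addr)
buf[0] = 42

# Peer process (normally a separate interpreter): map the same region by name.
peer_addr = open_pinned_shm(SHM_NAME, BYTE_SIZE)
peer_buf = (ctypes.c_ubyte * BYTE_SIZE).from_address(peer_addr)
assert peer_buf[0] == 42

# Owner process: unregister, unmap, and unlink the region when done.
free_pinned_shm(SHM_NAME, addr, BYTE_SIZE)
```

Both `create_pinned_shm` and `open_pinned_shm` register their own mapping with `cudaHostRegisterPortable`, so each process that maps the region gets page-locked host memory that CUDA can use directly for host/device copies.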
36 changes: 36 additions & 0 deletions examples/splitwise/README.md
@@ -0,0 +1,36 @@
# Run the Examples on NVIDIA CUDA GPU

## Prepare the Environment
Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) for instructions on pulling the Docker image, for example:
```bash
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0
```

[NVIDIA MLNX_OFED](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/) and [Redis](https://redis.io/) are pre-installed in the Docker container.

## Build and Install FastDeploy

```bash
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy

export ENABLE_FD_RDMA=1

# Argument 1: Whether to build wheel package (1 for yes, 0 for compile only)
# Argument 2: Python interpreter path
# Argument 3: Whether to compile CPU inference operators
# Argument 4: Target GPU architectures
bash build.sh 1 python false [80,90]
```

## Run the Examples

Run the shell scripts in this directory, e.g. ```bash start_v0_tp1.sh``` or ```bash start_v1_tp1.sh```.

Note that there are two methods for splitwise deployment:
* v0: uses splitwise_scheduler or dp_scheduler; requests are scheduled in the engine.
* v1: uses the router; requests are scheduled in the router.

# Run the Examples on Kunlunxin XPU

Coming soon...
34 changes: 14 additions & 20 deletions examples/splitwise/start_mixed.sh
@@ -3,41 +3,35 @@ set -e

# Test mixed server + router

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"

export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=0
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

S1_PORT=52400
S2_PORT=52500
ROUTER_PORT=52600

ports=(
$S1_PORT $((S1_PORT + 1)) $((S1_PORT + 2)) $((S1_PORT + 3))
$S2_PORT $((S2_PORT + 1)) $((S2_PORT + 2)) $((S2_PORT + 3))
$ROUTER_PORT
)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.router.launch \
--port ${ROUTER_PORT} \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

# start modelserver 0
export CUDA_VISIBLE_DEVICES=0
@@ -53,7 +47,6 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &
sleep 1

wait_for_health ${S1_PORT}

@@ -76,12 +69,13 @@ wait_for_health ${S2_PORT}

# send request
sleep 10 # make sure server is registered to router
echo "send request..."
curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 20,
"stream": true
"stream": false
}'
32 changes: 15 additions & 17 deletions examples/splitwise/start_v0_tp1.sh
@@ -6,22 +6,8 @@ set -e
# v0: using splitwise_scheduler or dp_scheduler
# v1: using local_scheduler + router

wait_for_health() {
local server_port=$1
while true; do
status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
if [ "$status_code" -eq 200 ]; then
break
else
echo "Service not ready. Retrying in 2s..."
sleep 2
fi
done
}

# prepare environment
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"

export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=1
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

P_PORT=52400
D_PORT=52500
REDIS_PORT=56388
REDIS_PORT="${REDIS_PORT:-56388}"

ports=(
$P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
$D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
$REDIS_PORT
)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start redis
if ! redis-cli -p ${REDIS_PORT} ping &>/dev/null; then
@@ -104,12 +101,13 @@ wait_for_health ${D_PORT}

# send request
sleep 10 # make sure server is registered to router
echo "send request..."
curl -X POST "http://0.0.0.0:${D_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 20,
"stream": true
"stream": false
}'
111 changes: 0 additions & 111 deletions examples/splitwise/start_v0_tp2.sh

This file was deleted.
