Skip to content

Commit 0f2cfd3

Browse files
committed
Add splitwise deployment using RDMA
1 parent 98f1ab4 commit 0f2cfd3

File tree

4 files changed

+659
-5
lines changed

4 files changed

+659
-5
lines changed

.github/workflows/_unit_test_coverage.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ jobs:
105105
FD_CACHE_QUEUE_PORT=$((8098 + DEVICE_PORT * 100))
106106
FD_ROUTER_PORT=$((8048 + DEVICE_PORT * 100))
107107
FD_CONNECTOR_PORT=$((8038 + DEVICE_PORT * 100))
108+
FD_RDMA_PORT=$((9008 + DEVICE_PORT * 100))
108109
echo "Test ENV Parameter:"
109110
echo "========================================================="
110111
echo "FLASK_PORT=${FLASK_PORT}"
@@ -114,6 +115,7 @@ jobs:
114115
echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
115116
echo "FD_ROUTER_PORT=${FD_ROUTER_PORT}"
116117
echo "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}"
118+
echo "FD_RDMA_PORT=${FD_RDMA_PORT}"
117119
echo "DEVICES=${DEVICES}"
118120
echo "========================================================="
119121
@@ -165,6 +167,7 @@ jobs:
165167
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
166168
-e "FD_ROUTER_PORT=${FD_ROUTER_PORT}" \
167169
-e "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}" \
170+
-e "FD_RDMA_PORT=${FD_RDMA_PORT}" \
168171
-e TZ="Asia/Shanghai" \
169172
-e "fd_wheel_url=${fd_wheel_url}" \
170173
-e "BASE_REF=${BASE_REF}" \

tests/e2e/test_ernie_03b_pd_router_v1.py renamed to tests/e2e/test_ernie_03b_pd_router_v1_ipc.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
# Test splitwise deployment which uses local_scheduler + router,
16-
# and ENABLE_V1_KVCACHE_SCHEDULER is 1
15+
# Test splitwise deployment: use local_scheduler + router,
16+
# set ENABLE_V1_KVCACHE_SCHEDULER to 1, and use IPC to transfer the cache.
1717

1818
import json
1919
import os
@@ -111,7 +111,7 @@ def setup_and_run_server():
111111
env_prefill["CUDA_VISIBLE_DEVICES"] = "0"
112112
env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"
113113
env_prefill["FD_LOG_DIR"] = "log_prefill"
114-
prefill_log_path = "server.log"
114+
prefill_log_path = "prefill.log"
115115
prefill_cmd = [
116116
sys.executable,
117117
"-m",
@@ -161,7 +161,7 @@ def setup_and_run_server():
161161
env_decode["CUDA_VISIBLE_DEVICES"] = "1"
162162
env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"
163163
env_decode["FD_LOG_DIR"] = "log_decode"
164-
decode_log_path = "decode_server.log"
164+
decode_log_path = "decode.log"
165165
decode_cmd = [
166166
sys.executable,
167167
"-m",
@@ -216,7 +216,7 @@ def setup_and_run_server():
216216
try:
217217
os.killpg(process_prefill.pid, signal.SIGTERM)
218218
os.killpg(process_decode.pid, signal.SIGTERM)
219-
clean_ports()
219+
clean_ports(PORTS_TO_CLEAN)
220220
except Exception as e:
221221
print(f"Failed to kill process group: {e}")
222222
raise RuntimeError(f"API server did not start on port {FD_API_PORT}")

0 commit comments

Comments
 (0)