[2026-05-18 12:01:19] Generated KT GPU experts masks using 'uniform' strategy: 43 MoE layers (out of 43 total layers) x 256 experts, total GPU experts in MoE layers = 430
[2026-05-18 12:01:19] [KT] Created shared staging buffer: 16.0 MiB (shape=torch.Size([2048, 4096]), dtype=torch.bfloat16)
self.load_model()
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/model_executor/model_runner.py", line 1032, in load_model
self.model = self.loader.load_model(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/model_loader/loader.py", line 671, in load_model
model = _initialize_model(
^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/model_loader/loader.py", line 277, in _initialize_model
return model_class(**kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/models/deepseek_v4.py", line 1437, in __init__
self.model = DeepseekV4Model(
^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/models/deepseek_v4.py", line 1266, in __init__
self.layers, self.start_layer, self.end_layer = make_layers(
^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/utils/common.py", line 648, in make_layers
+ get_offloader().wrap_modules(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/utils/offloader.py", line 36, in wrap_modules
return list(all_modules_generator)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/utils/common.py", line 650, in <genexpr>
layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/models/deepseek_v4.py", line 1268, in <lambda>
lambda idx, prefix: DeepseekV4DecoderLayer(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/models/deepseek_v4.py", line 1014, in __init__
self.mlp = _V4MoE(
^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/models/deepseek_v2.py", line 493, in __init__
self.experts = get_moe_impl_class(quant_config)(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/layers/moe/fused_moe_triton/layer.py", line 291, in __init__
self.quant_method.create_weights(
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/sglang/srt/layers/moe/kt_ep_wrapper.py", line 2311, in create_weights
self.wrapper = KTMoEWrapper(
^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/kt_kernel/experts.py", line 207, in __new__
return _create_inference_wrapper(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/aigao/miniconda3/envs/kt-kernel/lib/python3.12/site-packages/kt_kernel/experts.py", line 353, in _create_inference_wrapper
raise ValueError(
ValueError: swiglu_limit=10.0 is only supported on method='MXFP4', got method='AMXINT4' (backend=AMXMoEWrapper). This usually means SGLANG_DSV4_2604_SUBMODE=2604B is set in the environment while the current launch does not actually use MXFP4 weights — either unset the env or pass --kt-method MXFP4.
export FLASHINFER_CUDA_ARCH_LIST=12.0a
export TORCH_CUDA_ARCH_LIST="8.9+PTX"
export SGLANG_DSV4_MODE=2604
export SGLANG_DSV4_2604_SUBMODE=2604B
numactl --interleave=all python -m sglang.launch_server \
--served-model-name DeepSeek-V4-Flash \
--host 0.0.0.0 --port 30000 \
--model /mnt/nvme0/ai-models/DeepSeek-V4-Flash/ \
--kt-weight-path /home/aigao/DeepSeek-V4-Flash-AMXINT4 \
--kt-method AMXINT4 \
--kt-num-gpu-experts 10 \
--kt-cpuinfer 35 \
--kt-threadpool-count 2 \
--kt-gpu-prefill-token-threshold 4096 \
--kt-enable-dynamic-expert-update \
--tensor-parallel-size 1 \
--context-length 32768 \
--attention-backend flashinfer \
--mem-fraction-static 0.85 \
--chunked-prefill-size 2048 \
--max-prefill-tokens 2048 \
--max-running-requests 2 \
--watchdog-timeout 1200 \
--disable-shared-experts-fusion \
--trust-remote-code \
--cuda-graph-bs 1 \
--cuda-graph-max-bs 1 \
--disable-radix-cache \
--skip-server-warmup
Reminder
System Info
GPU: NVIDIA RTX 4090 (48 GB)
CPU: Intel Xeon Gold 6416H (with AMX)
Reproduction
Others