File tree Expand file tree Collapse file tree 4 files changed +37
-5
lines changed Expand file tree Collapse file tree 4 files changed +37
-5
lines changed Original file line number Diff line number Diff line change @@ -12,8 +12,10 @@ trap remove_docker_container EXIT
12
12
remove_docker_container
13
13
14
14
# Run the image
15
- docker run -itd -v ~ /.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
16
- docker run -itd -v ~ /.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
15
+ docker run -itd --entrypoint /bin/bash -v ~ /.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
16
+ --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
17
+ docker run -itd --entrypoint /bin/bash -v ~ /.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
18
+ --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
17
19
18
20
# offline inference
19
21
docker exec cpu-test bash -c " python3 examples/offline_inference.py"
Original file line number Diff line number Diff line change @@ -6,7 +6,13 @@ RUN apt-get update -y \
6
6
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
7
7
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
8
8
9
- RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc
9
+ # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
10
+ # intel-openmp provides additional performance improvement vs. openmp
11
+ # tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
12
+ RUN pip install intel-openmp
13
+
14
+ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
15
+
10
16
11
17
RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
12
18
@@ -31,4 +37,4 @@ WORKDIR /workspace/
31
37
32
38
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
33
39
34
- CMD ["/bin/bash "]
40
+ ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server "]
Original file line number Diff line number Diff line change @@ -398,6 +398,27 @@ def update_environment_variables(envs: Dict[str, str]):
398
398
os .environ [k ] = v
399
399
400
400
401
def init_kmp_env():
    """Apply Intel OpenMP (KMP_*) performance tunings for CPU inference.

    No-op unless both conditions hold:
      * the current backend is CPU (``is_cpu()``), and
      * the Intel OpenMP runtime (``libiomp5.so``) is present in
        ``LD_PRELOAD`` — the KMP_* knobs are specific to that runtime.

    Mutates ``os.environ`` as a side effect; returns ``None``.
    """
    if not is_cpu():
        return

    # Fixed typo: was `ld_prealod_str`.
    ld_preload_str = os.getenv("LD_PRELOAD", "")
    if "libiomp5.so" not in ld_preload_str:
        return

    # The time (milliseconds) that a thread should wait after completing
    # the execution of a parallel region, before sleeping.
    os.environ['KMP_BLOCKTIME'] = "1"
    # Dump OpenMP settings on start up.
    os.environ['KMP_SETTINGS'] = "1"
    # Prevents the CPU from running in a low performance state.
    os.environ['KMP_TPAUSE'] = "0"
    # Provides fine granularity parallelism.
    os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist"
    os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
    os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
421
+
401
422
def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
    """Split *lst* into successive chunks of at most *chunk_size* items."""
    chunks: List[List[T]] = []
    start = 0
    while start < len(lst):
        chunks.append(lst[start:start + chunk_size])
        start += chunk_size
    return chunks
Original file line number Diff line number Diff line change 13
13
from vllm .logger import init_logger
14
14
from vllm .model_executor import set_random_seed
15
15
from vllm .sequence import ExecuteModelRequest
16
- from vllm .utils import STR_DTYPE_TO_TORCH_DTYPE
16
+ from vllm .utils import STR_DTYPE_TO_TORCH_DTYPE , init_kmp_env
17
17
from vllm .worker .cpu_model_runner import CPUModelRunner
18
18
from vllm .worker .worker_base import (LocalOrDistributedWorkerBase ,
19
19
LoraNotSupportedWorkerBase , WorkerInput )
@@ -150,6 +150,9 @@ def __init__(
150
150
if self .is_driver_worker :
151
151
assert self .rank == 0 , "The driver worker must have rank 0."
152
152
153
+ # try to initialize intel openmp optimized tunings
154
+ init_kmp_env ()
155
+
153
156
if self .model_config .trust_remote_code :
154
157
# note: lazy import to avoid importing torch before initializing
155
158
from vllm .utils import init_cached_hf_modules
You can’t perform that action at this time.
0 commit comments