Skip to content

Commit fa03b74

Browse files
committed
[Security] Prevent new imports of (cloud)pickle
This change proposes an additional pre-commit check that bans new imports of `pickle` or `cloudpickle` unless they are added to an allowlist in `tools/check_pickle_imports.py`. The reasoning is that these modules are known to be unsafe when deserializing data from potentially untrusted parties. It has resulted in multiple CVEs for vLLM, and numerous in the python ecosystem more broadly. There are safer alternatives available. Two relevant alternatives already in use in vLLM are `msgpack` and `pydantic`. To help prevent accidental future CVEs, we should do what we can to discourage any further use of the unsafe choices. This check is not perfect. It is still possible to add `pickle` imports using `importlib`, for example. Diligence is still required on the part of code reviewers to watch out for unsafe additions to the code base. Signed-off-by: Russell Bryant <rbryant@redhat.com>
1 parent 2f1c19b commit fa03b74

File tree

2 files changed

+159
-0
lines changed

2 files changed

+159
-0
lines changed

.pre-commit-config.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,13 @@ repos:
143143
types: [python]
144144
pass_filenames: false
145145
additional_dependencies: [regex]
146+
- id: check-pickle-imports
147+
name: Prevent new pickle/cloudpickle imports
148+
entry: python tools/check_pickle_imports.py
149+
language: python
150+
types: [python]
151+
pass_filenames: false
152+
additional_dependencies: [pathspec, regex]
146153
# Keep `suggestion` last
147154
- id: suggestion
148155
name: Suggestion

tools/check_pickle_imports.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: Apache-2.0
3+
import os
4+
import sys
5+
6+
import regex as re
7+
8+
try:
9+
import pathspec
10+
except ImportError:
11+
print(
12+
"ERROR: The 'pathspec' library is required. "
13+
"Install it with 'pip install pathspec'.",
14+
file=sys.stderr)
15+
sys.exit(2)
16+
17+
# List of files (relative to repo root) that are allowed to import pickle or
18+
# cloudpickle
19+
#
20+
# STOP AND READ BEFORE YOU ADD ANYTHING ELSE TO THIS LIST:
21+
# The pickle and cloudpickle modules are known to be unsafe when deserializing
22+
# data from potentially untrusted parties. They have resulted in multiple CVEs
23+
# for vLLM and numerous vulnerabilities in the Python ecosystem more broadly.
24+
# Before adding new uses of pickle/cloudpickle, please consider safer
25+
# alternatives like msgpack or pydantic that are already in use in vLLM. Only
26+
# add to this list if absolutely necessary and after careful security review.
27+
ALLOWED_FILES = set([
28+
# pickle
29+
'vllm/utils.py',
30+
'vllm/v1/serial_utils.py',
31+
'vllm/v1/executor/multiproc_executor.py',
32+
'vllm/multimodal/hasher.py',
33+
'vllm/transformers_utils/config.py',
34+
'vllm/model_executor/models/registry.py',
35+
'tests/test_utils.py',
36+
'tests/tokenization/test_cached_tokenizer.py',
37+
'tests/model_executor/test_guided_processors.py',
38+
'vllm/distributed/utils.py',
39+
'vllm/distributed/parallel_state.py',
40+
'vllm/engine/multiprocessing/client.py',
41+
'vllm/distributed/device_communicators/custom_all_reduce_utils.py',
42+
'vllm/distributed/device_communicators/shm_broadcast.py',
43+
'vllm/engine/multiprocessing/engine.py',
44+
'benchmarks/kernels/graph_machete_bench.py',
45+
'benchmarks/kernels/benchmark_lora.py',
46+
'benchmarks/kernels/benchmark_machete.py',
47+
'benchmarks/fused_kernels/layernorm_rms_benchmarks.py',
48+
'benchmarks/cutlass_benchmarks/w8a8_benchmarks.py',
49+
'benchmarks/cutlass_benchmarks/sparse_benchmarks.py',
50+
# cloudpickle
51+
'vllm/worker/worker_base.py',
52+
'vllm/executor/mp_distributed_executor.py',
53+
'vllm/executor/ray_distributed_executor.py',
54+
'vllm/entrypoints/llm.py',
55+
'tests/utils.py',
56+
# pickle and cloudpickle
57+
'vllm/utils.py',
58+
'vllm/v1/serial_utils.py',
59+
'vllm/v1/executor/multiproc_executor.py',
60+
'vllm/transformers_utils/config.py',
61+
'vllm/model_executor/models/registry.py',
62+
'vllm/engine/multiprocessing/client.py',
63+
'vllm/engine/multiprocessing/engine.py',
64+
])
65+
66+
PICKLE_RE = re.compile(r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)"
67+
r"|from\s+(pickle|cloudpickle)\s+import\b)")
68+
69+
70+
def is_python_file(path):
71+
return path.endswith('.py')
72+
73+
74+
def scan_file(path):
75+
with open(path, encoding='utf-8') as f:
76+
for line in f:
77+
if PICKLE_RE.match(line):
78+
return True
79+
return False
80+
81+
82+
def load_gitignore(repo_root):
83+
gitignore_path = os.path.join(repo_root, '.gitignore')
84+
patterns = []
85+
if os.path.exists(gitignore_path):
86+
with open(gitignore_path, encoding='utf-8') as f:
87+
patterns = f.read().splitlines()
88+
# Always ignore .git directory
89+
patterns.append('.git/')
90+
return pathspec.PathSpec.from_lines('gitwildmatch', patterns)
91+
92+
93+
def main():
94+
repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
95+
spec = load_gitignore(repo_root)
96+
bad_files = []
97+
for dirpath, _, filenames in os.walk(repo_root):
98+
for filename in filenames:
99+
if not is_python_file(filename):
100+
continue
101+
abs_path = os.path.join(dirpath, filename)
102+
rel_path = os.path.relpath(abs_path, repo_root)
103+
# Skip ignored files
104+
if spec.match_file(rel_path):
105+
continue
106+
if scan_file(abs_path) and rel_path not in ALLOWED_FILES:
107+
bad_files.append(rel_path)
108+
if bad_files:
109+
print("\nERROR: The following files import 'pickle' or 'cloudpickle' "
110+
"but are not in the allowed list:")
111+
for f in bad_files:
112+
print(f" {f}")
113+
print("\nIf this is intentional, update the allowed list in "
114+
"tools/check_pickle_imports.py.")
115+
sys.exit(1)
116+
sys.exit(0)
117+
118+
119+
def test_regex():
120+
test_cases = [
121+
# Should match
122+
("import pickle", True),
123+
("import cloudpickle", True),
124+
("import pickle as pkl", True),
125+
("import cloudpickle as cpkl", True),
126+
("from pickle import *", True),
127+
("from cloudpickle import dumps", True),
128+
("from pickle import dumps, loads", True),
129+
("from cloudpickle import (dumps, loads)", True),
130+
(" import pickle", True),
131+
("\timport cloudpickle", True),
132+
("from pickle import loads", True),
133+
# Should not match
134+
("import somethingelse", False),
135+
("from somethingelse import pickle", False),
136+
("# import pickle", False),
137+
("print('import pickle')", False),
138+
("import pickleas as asdf", False),
139+
]
140+
for i, (line, should_match) in enumerate(test_cases):
141+
result = bool(PICKLE_RE.match(line))
142+
assert result == should_match, (
143+
f"Test case {i} failed: '{line}' "
144+
f"(expected {should_match}, got {result})")
145+
print("All regex tests passed.")
146+
147+
148+
if __name__ == '__main__':
149+
if '--test-regex' in sys.argv:
150+
test_regex()
151+
else:
152+
main()

0 commit comments

Comments
 (0)