Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
fi

# Install vllm wheel first, so that torch etc will be installed.
# !bang
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
Expand Down
24 changes: 16 additions & 8 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ aiohttp==3.10.11
aiohttp-cors==0.8.1
# via ray
aiosignal==1.3.1
# via
# aiohttp
# ray
# via aiohttp
albucore==0.0.16
# via terratorch
albumentations==1.4.6
Expand Down Expand Up @@ -139,7 +137,7 @@ contourpy==1.3.0
# via matplotlib
cramjam==2.9.0
# via fastparquet
cupy-cuda12x==13.3.0
cupy-cuda12x==13.5.1
# via ray
cycler==0.12.1
# via matplotlib
Expand Down Expand Up @@ -226,7 +224,6 @@ frozenlist==1.5.0
# via
# aiohttp
# aiosignal
# ray
fsspec==2024.9.0
# via
# datasets
Expand Down Expand Up @@ -603,10 +600,18 @@ opencv-python-headless==4.11.0.86
opentelemetry-api==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
opentelemetry-exporter-prometheus==0.56b0
# via ray
opentelemetry-proto==1.36.0
# via ray
opentelemetry-sdk==1.35.0
# via mlflow-skinny
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# ray
opentelemetry-semantic-conventions==0.56b0
# via opentelemetry-sdk
packaging==24.2
Expand Down Expand Up @@ -697,7 +702,9 @@ pqdm==0.2.0
pretrainedmodels==0.7.4
# via segmentation-models-pytorch
prometheus-client==0.22.0
# via ray
# via
# opentelemetry-exporter-prometheus
# ray
propcache==0.2.0
# via yarl
proto-plus==1.26.1
Expand All @@ -707,6 +714,7 @@ protobuf==5.28.3
# google-api-core
# googleapis-common-protos
# mlflow-skinny
# opentelemetry-proto
# proto-plus
# ray
# tensorboardx
Expand Down Expand Up @@ -854,7 +862,7 @@ rasterio==1.4.3
# rioxarray
# terratorch
# torchgeo
ray==2.43.0
ray==2.48.0
# via -r requirements/test.in
redis==5.2.0
# via tensorizer
Expand Down
203 changes: 87 additions & 116 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,69 @@ def run(self):
self.copy_file(file, dst_file)


class repackage_wheel(build_ext):
class precompiled_wheel_utils:
"""Extracts libraries and other files from an existing wheel."""

def get_base_commit_in_main_branch(self) -> str:
@staticmethod
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
import tempfile
import zipfile

temp_dir = None
try:
if not os.path.isfile(wheel_url_or_path):
wheel_filename = wheel_url_or_path.split("/")[-1]
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_url_or_path} "
f"to {wheel_path}")
from urllib.request import urlretrieve
urlretrieve(wheel_url_or_path, filename=wheel_path)
else:
wheel_path = wheel_url_or_path
print(f"Using existing wheel at {wheel_path}")

package_data_patch = {}

with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
]

compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members = list(
filter(lambda x: x.filename in files_to_copy,
wheel.filelist))
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))

for file in file_members:
print(f"[extract] {file.filename}")
target_path = os.path.join(".", file.filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with wheel.open(file.filename) as src, open(
target_path, "wb") as dst:
shutil.copyfileobj(src, dst)

pkg = os.path.dirname(file.filename).replace("/", ".")
package_data_patch.setdefault(pkg, []).append(
os.path.basename(file.filename))

return package_data_patch
finally:
if temp_dir is not None:
print(f"Removing temporary directory {temp_dir}")
shutil.rmtree(temp_dir)

@staticmethod
def get_base_commit_in_main_branch() -> str:
# Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
Expand Down Expand Up @@ -334,115 +393,6 @@ def get_base_commit_in_main_branch(self) -> str:
"wheel may not be compatible with your dev branch: %s", err)
return "nightly"

def run(self) -> None:
assert _is_cuda(
), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"

wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is None:
base_commit = self.get_base_commit_in_main_branch()
wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fallback to nightly wheel if latest commit wheel is unavailable,
# in this rare case, the nightly release CI hasn't finished on main.
if not is_url_available(wheel_location):
wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

import zipfile

if os.path.isfile(wheel_location):
wheel_path = wheel_location
print(f"Using existing wheel={wheel_path}")
else:
# Download the wheel from a given URL, assume
# the filename is the last part of the URL
wheel_filename = wheel_location.split("/")[-1]

import tempfile

# create a temporary directory to store the wheel
temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
wheel_path = os.path.join(temp_dir, wheel_filename)
print(f"Downloading wheel from {wheel_location} to {wheel_path}")
from urllib.request import urlretrieve
try:
urlretrieve(wheel_location, filename=wheel_path)
except Exception as e:
from setuptools.errors import SetupError
raise SetupError(
f"Failed to get vLLM wheel from {wheel_location}") from e

# Set the dist_dir for Docker build context
dist_dir = ("/workspace/dist"
if envs.VLLM_DOCKER_BUILD_CONTEXT else "dist")
os.makedirs(dist_dir, exist_ok=True)

# Extract only necessary compiled .so files from precompiled wheel
with zipfile.ZipFile(wheel_path) as wheel:
# Get version from METADATA (optional, mostly useful for logging)
metadata_file = next((n for n in wheel.namelist()
if n.endswith(".dist-info/METADATA")), None)
if not metadata_file:
raise RuntimeError(
"Could not find METADATA in precompiled wheel.")
metadata = wheel.read(metadata_file).decode()
version_line = next((line for line in metadata.splitlines()
if line.startswith("Version: ")), None)
if not version_line:
raise RuntimeError(
"Could not determine version from METADATA.")
version = version_line.split(": ")[1].strip()

print(f"Extracting precompiled kernels from vLLM wheel version: "
f"{version}")

# List of compiled shared objects to extract
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
]

file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))

for file in file_members:
print(f"Extracting and including {file.filename} "
"from existing wheel")
package_name = os.path.dirname(file.filename).replace("/", ".")
file_name = os.path.basename(file.filename)

if package_name not in package_data:
package_data[package_name] = []

output_base = (dist_dir
if envs.VLLM_DOCKER_BUILD_CONTEXT else ".")
target_path = os.path.join(output_base, file.filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with wheel.open(file.filename) as src, open(target_path,
"wb") as dst:
shutil.copyfileobj(src, dst)

package_data[package_name].append(file_name)

# Copy wheel into dist dir for Docker to consume (e.g., via --mount)
if envs.VLLM_DOCKER_BUILD_CONTEXT:
arch_tag = "cp38-abi3-manylinux1_x86_64"
corrected_wheel_name = f"vllm-{version}-{arch_tag}.whl"
final_wheel_path = os.path.join(dist_dir, corrected_wheel_name)

print(
"Docker build context detected, copying precompiled wheel to "
f"{final_wheel_path}")
shutil.copy2(wheel_path, final_wheel_path)


def _no_device() -> bool:
return VLLM_TARGET_DEVICE == "empty"
Expand Down Expand Up @@ -676,16 +626,37 @@ def _read_requirements(filename: str) -> list[str]:
]
}

# If using precompiled, extract and patch package_data (in advance of setup)
if envs.VLLM_USE_PRECOMPILED:
assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
wheel_url = wheel_location
else:
base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
from urllib.request import urlopen
try:
with urlopen(wheel_url) as resp:
if resp.status != 200:
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
except Exception as e:
print(f"[warn] Falling back to nightly wheel: {e}")
wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
wheel_url)
for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files)

if _no_device():
ext_modules = []

if not ext_modules:
if not ext_modules or envs.VLLM_USE_PRECOMPILED:
# Disable build_ext when using precompiled wheel
cmdclass = {}
else:
cmdclass = {
"build_ext":
repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
}
cmdclass = {"build_ext": cmake_build_ext}

setup(
# static metadata should rather go in pyproject.toml
Expand Down