diff --git a/.gitignore b/.gitignore
index d9dec081714ec..a050864d9ddd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,15 +1,172 @@
-**/*.pyc
-**/__pycache__/
-*.egg-info/
-*.eggs/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
 *.so
-*.log
-*.csv
+
+# Distribution / packaging
+.Python
 build/
-docs/build/
+develop-eggs/
 dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
-*.pkl
-*.png
-**/log.txt
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# VSCode
 .vscode/
+
+# DS Store
+.DS_Store
+
+# Results
+*.csv
+
+# Python pickle files
+*.pkl
diff --git a/README.md b/README.md
index 2652422c36cfb..67fd8900d013a 100644
--- a/README.md
+++ b/README.md
@@ -1,50 +1,84 @@
-# vLLM: Easy, Fast, and Cheap LLM Serving for Everyone
+vLLM
+
-| [**Documentation**](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/) | [**Blog**]() |
+Easy, fast, and cheap LLM serving for everyone
+
-vLLM is a fast and easy-to-use library for LLM inference and serving.
+| Documentation | Blog |
+
-## Latest News 🔥
+
-- [2023/06] We officially released vLLM! vLLM has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid April. Check out our [blog post]().
+---
-## Getting Started
+*Latest News* 🔥
-Visit our [documentation](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/) to get started.
-- [Installation](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/getting_started/installation.html): `pip install vllm`
-- [Quickstart](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/getting_started/quickstart.html)
-- [Supported Models](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/models/supported_models.html)
+- [2023/06] We officially released vLLM! vLLM has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post]().
+
+---
-## Key Features
+vLLM is a fast and easy-to-use library for LLM inference and serving.
-vLLM comes with many powerful features that include:
+vLLM is fast with:
-- State-of-the-art performance in serving throughput
+- State-of-the-art serving throughput
 - Efficient management of attention key and value memory with **PagedAttention**
-- Seamless integration with popular HuggingFace models
 - Dynamic batching of incoming requests
 - Optimized CUDA kernels
-- High-throughput serving with various decoding algorithms, including *parallel sampling* and *beam search*
+
+vLLM is flexible and easy to use with:
+
+- Seamless integration with popular HuggingFace models
+- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server

+Install vLLM with pip or [from source](https://llm-serving-cacheflow.readthedocs-hosted.com/en/latest/getting_started/installation.html#build-from-source):
+
+```bash
+pip install vllm
+```
+
+## Getting Started
+
+Visit our [documentation](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/) to get started.
+- [Installation](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/getting_started/installation.html)
+- [Quickstart](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/getting_started/quickstart.html)
+- [Supported Models](https://llm-serving-cacheflow.readthedocs-hosted.com/_/sharing/Cyo52MQgyoAWRQ79XA4iA2k8euwzzmjY?next=/en/latest/models/supported_models.html)
+
 ## Performance

 vLLM outperforms HuggingFace Transformers (HF) by up to 24x and Text Generation Inference (TGI) by up to 3.5x, in terms of throughput. For details, check out our [blog post]().

 Serving throughput when each request asks for 1 output completion.

 Serving throughput when each request asks for 3 output completions.
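The feature list and `pip install vllm` instructions in the README above describe vLLM's offline, Python-library workflow on top of HuggingFace models. Below is a minimal sketch of that workflow, assuming the `LLM` and `SamplingParams` entry points from the linked quickstart; the model name and prompts are illustrative, not taken from this patch.

```python
# Offline-inference sketch with vLLM's Python API.
# Model name and prompts are illustrative examples.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="facebook/opt-125m")  # any supported HuggingFace model
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(output.prompt, output.outputs[0].text)
```

Decoding behavior, such as the number of parallel samples per prompt, is selected through `SamplingParams`, which is where the "various decoding algorithms" bullet comes into play.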

diff --git a/assets/figures/perf_a100_n1_dark.png b/docs/source/assets/figures/perf_a100_n1_dark.png
similarity index 100%
rename from assets/figures/perf_a100_n1_dark.png
rename to docs/source/assets/figures/perf_a100_n1_dark.png
diff --git a/assets/figures/perf_a100_n1_light.png b/docs/source/assets/figures/perf_a100_n1_light.png
similarity index 100%
rename from assets/figures/perf_a100_n1_light.png
rename to docs/source/assets/figures/perf_a100_n1_light.png
diff --git a/assets/figures/perf_a100_n3_dark.png b/docs/source/assets/figures/perf_a100_n3_dark.png
similarity index 100%
rename from assets/figures/perf_a100_n3_dark.png
rename to docs/source/assets/figures/perf_a100_n3_dark.png
diff --git a/assets/figures/perf_a100_n3_light.png b/docs/source/assets/figures/perf_a100_n3_light.png
similarity index 100%
rename from assets/figures/perf_a100_n3_light.png
rename to docs/source/assets/figures/perf_a100_n3_light.png
diff --git a/assets/figures/perf_a10g_n1_dark.png b/docs/source/assets/figures/perf_a10g_n1_dark.png
similarity index 100%
rename from assets/figures/perf_a10g_n1_dark.png
rename to docs/source/assets/figures/perf_a10g_n1_dark.png
diff --git a/assets/figures/perf_a10g_n1_light.png b/docs/source/assets/figures/perf_a10g_n1_light.png
similarity index 100%
rename from assets/figures/perf_a10g_n1_light.png
rename to docs/source/assets/figures/perf_a10g_n1_light.png
diff --git a/assets/figures/perf_a10g_n3_dark.png b/docs/source/assets/figures/perf_a10g_n3_dark.png
similarity index 100%
rename from assets/figures/perf_a10g_n3_dark.png
rename to docs/source/assets/figures/perf_a10g_n3_dark.png
diff --git a/assets/figures/perf_a10g_n3_light.png b/docs/source/assets/figures/perf_a10g_n3_light.png
similarity index 100%
rename from assets/figures/perf_a10g_n3_light.png
rename to docs/source/assets/figures/perf_a10g_n3_light.png
diff --git a/docs/source/assets/logos/vllm-logo-only-light.png b/docs/source/assets/logos/vllm-logo-only-light.png
new file mode 100644
index 0000000000000..7aaf174872594
Binary files /dev/null and b/docs/source/assets/logos/vllm-logo-only-light.png differ
diff --git a/docs/source/assets/logos/vllm-logo-text-dark.png b/docs/source/assets/logos/vllm-logo-text-dark.png
new file mode 100644
index 0000000000000..959a42fd36c72
Binary files /dev/null and b/docs/source/assets/logos/vllm-logo-text-dark.png differ
diff --git a/docs/source/assets/logos/vllm-logo-text-light.png b/docs/source/assets/logos/vllm-logo-text-light.png
new file mode 100644
index 0000000000000..1ead9972879c2
Binary files /dev/null and b/docs/source/assets/logos/vllm-logo-text-light.png differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ecb32f482000b..ddcf5772d625e 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,18 +1,43 @@
 Welcome to vLLM!
 ================

-**vLLM** is a fast and easy-to-use library for LLM inference and serving.
-Its core features include:
-
-- State-of-the-art performance in serving throughput
-- Efficient management of attention key and value memory with **PagedAttention**
-- Seamless integration with popular HuggingFace models
-- Dynamic batching of incoming requests
-- Optimized CUDA kernels
-- High-throughput serving with various decoding algorithms, including *parallel sampling* and *beam search*
-- Tensor parallelism support for distributed inference
-- Streaming outputs
-- OpenAI-compatible API server
+.. figure:: ./assets/logos/vllm-logo-text-light.png
+  :width: 60%
+  :align: center
+  :alt: vLLM
+  :class: no-scaled-link
+
+.. raw:: html
+
+   Easy, fast, and cheap LLM serving for everyone
+
+
+vLLM is a fast and easy-to-use library for LLM inference and serving.
+
+vLLM is fast with:
+
+* State-of-the-art serving throughput
+* Efficient management of attention key and value memory with **PagedAttention**
+* Dynamic batching of incoming requests
+* Optimized CUDA kernels
+
+vLLM is flexible and easy to use with:
+
+* Seamless integration with popular HuggingFace models
+* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+* Tensor parallelism support for distributed inference
+* Streaming outputs
+* OpenAI-compatible API server

 For more information, please refer to our `blog post <>`_.
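Both the README and the docs index list an OpenAI-compatible API server among vLLM's features. The client-side sketch below is for illustration only: it assumes a vLLM server is already running locally on port 8000 and serving `facebook/opt-125m`, so the host, port, and model name are assumptions rather than values taken from this patch.

```python
# Hypothetical query against a locally running OpenAI-compatible vLLM server.
# Endpoint host/port and model name are assumed for illustration.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "facebook/opt-125m",
        "prompt": "San Francisco is a",
        "max_tokens": 32,
        "temperature": 0.7,
    },
)
print(resp.json()["choices"][0]["text"])
```

Because the server follows the OpenAI completions schema, existing OpenAI client code can typically be pointed at it by changing only the base URL.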