
Making make install work better by default. #2004

Merged
merged 11 commits into from Jun 4, 2024
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -68,7 +68,7 @@ jobs:
~/.cargo/git
- name: Install
run: |
make install
make install-cpu
Collaborator Author:
The simple server tests run on non-accelerated hardware, therefore they cannot install the kernels.

- name: Run server tests
run: |
pip install pytest
4 changes: 4 additions & 0 deletions Cargo.toml
@@ -19,6 +19,10 @@ tokenizers = { version = "0.19.1", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }

[profile.release]
incremental = true
Collaborator Author:
This speeds up development by significantly improving build times.


[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
lto = "fat"
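A quick way to see what `incremental = true` buys on the release profile, as a minimal sketch (the touched path is only an illustrative small edit, not a file this PR singles out):

    # First release build is a full build and populates target/release/incremental
    cargo build --release
    # Simulate a small change to one crate in the workspace (path is illustrative)
    touch router/src/main.rs
    # Rebuild: with incremental enabled, rustc reuses unchanged compilation units
    # instead of recompiling the whole crate from scratch
    cargo build --release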
2 changes: 1 addition & 1 deletion Dockerfile
@@ -193,7 +193,7 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
Collaborator Author:
pip installing means the library ends up in this location instead.


# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
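A sanity check for the new COPY source, as a sketch: since flash-attention v2 is now pip-installed in the builder stage, the compiled extension should already sit in site-packages there (the import below assumes the module name matches the `.so` being copied):

    # Run inside the flash-att-v2-builder stage
    python -c "import flash_attn_2_cuda; print(flash_attn_2_cuda.__file__)"
    # Expected: a path under /opt/conda/lib/python3.10/site-packages/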
17 changes: 10 additions & 7 deletions Makefile
@@ -1,12 +1,8 @@
install-server:
cd server && make install

install-custom-kernels:
if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi

install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .
install-server-cpu:
cd server && make install-server
Collaborator Author:
BUILD_EXTENSIONS is removed (it isn't used anymore anyway; it was only for bloom, and it's not even guaranteed to work on all hardware).


install-router:
cd router && cargo install --path .
@@ -17,7 +13,10 @@ install-launcher:
install-benchmark:
cd benchmark && cargo install --path .

install: install-server install-router install-launcher install-custom-kernels
install: install-server install-router install-launcher


install-cpu: install-server-cpu install-router install-launcher

server-dev:
cd server && make run-dev
@@ -28,6 +27,10 @@ router-dev:
rust-tests: install-router install-launcher
cargo test

install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .

integration-tests: install-integration-tests
pytest -s -vv -m "not private" integration-tests

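Roughly how the top-level targets are meant to be used after this change (a sketch; BUILD_EXTENSIONS is no longer consulted anywhere):

    # On CUDA machines: server (with kernels), router and launcher
    make install
    # On CPU-only machines, e.g. the non-accelerated CI runner above: skip the kernels
    make install-cpu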
6 changes: 5 additions & 1 deletion router/client/build.rs
@@ -13,7 +13,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.out_dir("src/pb")
.include_file("mod.rs")
.compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
.unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
.map_err(|e| match e.kind(){
std::io::ErrorKind::NotFound => {panic!("`protoc` not found, install libprotoc")},
std::io::ErrorKind::Other => {panic!("`protoc` version unsupported, upgrade protoc: https://github.com/protocolbuffers/protobuf/releases")},
e => {e}
}).unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));

Ok(())
}
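The remapped errors tell the user what to do about `protoc`; a minimal sketch of satisfying them on a Debian/Ubuntu base (package name assumed for that distro family):

    # Provide the `protoc` binary so the gRPC stubs in src/pb can be generated at build time
    apt-get update && apt-get install -y protobuf-compiler
    # If the "version unsupported" branch is hit, check the version and grab a newer
    # release from https://github.com/protocolbuffers/protobuf/releases
    protoc --version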
12 changes: 10 additions & 2 deletions server/Makefile
@@ -10,18 +10,26 @@ unit-tests:

gen-server:
# Compile protos
pip install grpcio-tools==1.51.1 mypy-protobuf==3.4.0 'types-protobuf>=3.20.4' --no-cache-dir
pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
Collaborator Author:
In line with pyproject.toml

mkdir text_generation_server/pb || true
python -m grpc_tools.protoc -I../proto --python_out=text_generation_server/pb \
--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/generate.proto
find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
touch text_generation_server/pb/__init__.py

install: gen-server
install-server: gen-server
pip install pip --upgrade
pip install -r requirements_cuda.txt
pip install -e ".[bnb, accelerate, quantize, peft, outlines]"


install: install-cuda
echo "Installed server"

install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention

install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm

run-dev:
SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded

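A rough map of the new server-side entry points, as a sketch of how the targets above compose:

    cd server
    make install-server   # protos + Python deps only, no compiled kernels (what top-level `make install-cpu` uses)
    make install-cuda     # install-server + flash-attention v1/v2 + vllm; plain `make install` aliases this
    make install-rocm     # install-server + ROCm builds of flash-attention v2 and vllm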
22 changes: 10 additions & 12 deletions server/Makefile-flash-att
@@ -1,16 +1,14 @@
flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec

flash-attention:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/HazyResearch/flash-attention.git

build-flash-attention: flash-attention
cd flash-attention && git fetch && git checkout $(flash_att_commit)
cd flash-attention && python setup.py build
cd flash-attention/csrc/rotary && python setup.py build
cd flash-attention/csrc/layer_norm && python setup.py build
build-flash-attention:
if [ ! -d 'flash-attention' ]; then \
Collaborator Author:
Very crude protection against recompiling every time; at least it doesn't fail when you rerun the command.

Collaborator Author:
Keeping the build- variants only because of the Dockerfiles.

pip install -U packaging ninja --no-cache-dir && \
git clone https://github.com/HazyResearch/flash-attention.git && \
cd flash-attention && git fetch && git checkout $(flash_att_commit) && \
MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build; \
fi

install-flash-attention: build-flash-attention
pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
if [ ! -d 'flash-attention' ]; then \
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install; \
fi
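The `if [ ! -d ... ]` guard is what makes `build-flash-attention` safe to rerun; a usage sketch, assuming this fragment is included from server/Makefile like the other Makefile-* files:

    cd server
    make build-flash-attention   # first run: clones the repo and builds the kernels
    make build-flash-attention   # second run: the flash-attention directory exists, so the recipe is a no-op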
41 changes: 18 additions & 23 deletions server/Makefile-flash-att-v2
@@ -1,29 +1,24 @@
flash_att_v2_commit_cuda := v2.5.8
flash_att_v2_commit_cuda := v2.5.9.post1
flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6

build-flash-attention-v2-cuda:
pip install -U packaging wheel
pip install flash-attn==$(flash_att_v2_commit_cuda)
Collaborator Author:
Wheel is necessary because flash-attn doesn't respect PEP 517 (Dao-AILab/flash-attention#453), so I feel nothing much better is doable.


flash-attention-v2-cuda:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/Dao-AILab/flash-attention.git flash-attention-v2
install-flash-attention-v2-cuda:
pip install -U packaging wheel
pip install flash-attn==$(flash_att_v2_commit_cuda)

build-flash-attention-v2-cuda: flash-attention-v2-cuda
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_cuda)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && python setup.py build

install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install

flash-attention-v2-rocm:
# Clone flash attention
pip install -U packaging ninja --no-cache-dir
git clone https://github.com/ROCm/flash-attention.git flash-attention-v2

build-flash-attention-v2-rocm: flash-attention-v2-rocm
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm)
cd flash-attention-v2 && git submodule update --init --recursive
cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
build-flash-attention-v2-rocm:
if [ ! -d 'flash-attention-v2' ]; then \
pip install -U packaging ninja --no-cache-dir && \
git clone https://github.com/ROCm/flash-attention.git flash-attention-v2 && \
cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) && \
git submodule update --init --recursive && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
fi

install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
if [ ! -d 'flash-attention-v2' ]; then \
cd flash-attention-v2 && \
GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install; \
fi
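On CUDA, the v2 kernels now come from the published package rather than a local clone; `wheel` is pre-installed because flash-attn does not respect PEP 517 (see the linked issue), so its build tooling must already be in the environment. The resulting flow, as a sketch:

    # Equivalent to `make install-flash-attention-v2-cuda` above
    pip install -U packaging wheel
    pip install flash-attn==v2.5.9.post1   # pinned via flash_att_v2_commit_cuda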
43 changes: 22 additions & 21 deletions server/Makefile-vllm
@@ -1,25 +1,26 @@
vllm-cuda:
# Clone vllm
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/Narsil/vllm.git vllm

build-vllm-cuda: vllm-cuda
cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
cd vllm && python setup.py build

build-vllm-cuda:
if [ ! -d 'vllm' ]; then \
pip install -U ninja packaging --no-cache-dir && \
git clone https://github.com/Narsil/vllm.git vllm &&\
cd vllm && \
git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa &&\
python setup.py build; \
fi
install-vllm-cuda: build-vllm-cuda
pip uninstall vllm -y || true
cd vllm && python setup.py install

vllm-rocm:
# Clone vllm
pip install -U ninja packaging --no-cache-dir
git clone https://github.com/fxmarty/rocm-vllm.git vllm
if [ ! -d 'vllm' ]; then \
cd vllm && pip install -e .; \
fi

build-vllm-rocm: vllm-rocm
cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install
build-vllm-rocm:
if [ ! -d 'vllm' ]; then \
pip install -U ninja packaging --no-cache-dir && \
git clone https://github.com/fxmarty/rocm-vllm.git vllm && \
cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479 && \
PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
fi

install-vllm-rocm: build-vllm-rocm
pip uninstall vllm -y || true
cd vllm && python setup.py install
if [ ! -d 'vllm' ]; then \
cd vllm && \
PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .; \
fi
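The vllm targets follow the same rerun-safe pattern; a usage sketch, again assuming the fragment is included from server/Makefile:

    cd server
    make install-vllm-cuda   # builds and installs the pinned Narsil/vllm fork (commit b5dfc61)
    # or, on ROCm hardware:
    make install-vllm-rocm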