Merge branch 'main' into multi-modal
sywangyi committed Jul 22, 2024
2 parents 47d0863 + f3435ba commit c590413
Showing 72 changed files with 4,915 additions and 599 deletions.
27 changes: 18 additions & 9 deletions .github/workflows/build.yaml
@@ -27,8 +27,8 @@ jobs:
concurrency:
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
- # TODO see with @Glegendre to get CPU runner here instead
- runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
+ runs-on:
+   group: aws-r7i-8xlarge-priv
permissions:
contents: write
packages: write
@@ -49,7 +49,7 @@ jobs:
export dockerfile="Dockerfile"
export label_extension=""
export docker_devices=""
export runs_on="nvidia-gpu"
export runs_on="aws-g5-12xlarge"
;;
rocm)
export dockerfile="Dockerfile_amd"
@@ -79,9 +79,15 @@ jobs:
uses: docker/setup-buildx-action@v3
with:
install: true
- config-inline: |
+ buildkitd-config-inline: |
[registry."docker.io"]
- mirrors = ["registry.github-runners.huggingface.tech"]
+ mirrors = ["registry-us-east-1-mirror.prod.aws.ci.huggingface.tech"]
+ - name: Login to internal Container Registry
+   uses: docker/login-action@v3
+   with:
+     username: ${{ secrets.REGISTRY_USERNAME }}
+     password: ${{ secrets.REGISTRY_PASSWORD }}
+     registry: registry.internal.huggingface.tech
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
@@ -103,7 +109,8 @@ jobs:
uses: docker/metadata-action@v5
with:
images: |
- registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
+ registry-us-east-1.prod.aws.ci.huggingface.tech/api-inference/community/text-generation-inference
+ registry.internal.huggingface.tech/api-inference/community/text-generation-inference
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
# If main, release or tag
@@ -115,7 +122,8 @@ jobs:
flavor: |
latest=auto
images: |
- registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
+ registry-us-east-1.prod.aws.ci.huggingface.tech/api-inference/community/text-generation-inference
+ registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
tags: |
@@ -141,7 +149,7 @@ jobs:
- name: Final
id: final
run: |
echo "docker_image=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
echo "docker_image=registry-us-east-1.prod.aws.ci.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
@@ -150,7 +158,8 @@ jobs:
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: build-and-push
runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
runs-on:
group: ${{ needs.build-and-push.outputs.runs_on }}
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
env:
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }}
3 changes: 2 additions & 1 deletion .github/workflows/load_test.yaml
@@ -15,7 +15,8 @@ jobs:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
- runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
+ runs-on:
+   group: aws-g5-12xlarge
env:
DOCKER_VOLUME: /cache
steps:
53 changes: 47 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

17 changes: 14 additions & 3 deletions Dockerfile
@@ -161,6 +161,17 @@ COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile
COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch
COPY server/fix_torch90a.sh fix_torch90a.sh

RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

@@ -225,10 +236,10 @@ COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-31
# Copy build artifacts from marlin kernels builder
COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

- # Copy builds artifacts from vllm builder
+ # Copy build artifacts from fbgemm builder
+ COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
+ # Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
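As an aside, the new `fbgemm-builder` stage can be built on its own with Docker's `--target` flag. This is a hedged debugging sketch, not part of the commit; the image tag is a placeholder:

```shell
# Build only the fbgemm-builder stage (stage name from the Dockerfile above)
# to debug the FBGEMM kernel build in isolation; run from the repo root.
docker build --target fbgemm-builder -t tgi-fbgemm-debug .
```
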
15 changes: 14 additions & 1 deletion docs/openapi.json
@@ -909,7 +909,7 @@
"tool_choice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
"$ref": "#/components/schemas/ToolChoice"
}
],
"nullable": true
@@ -2035,6 +2035,14 @@
}
}
},
"ToolChoice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
}
],
"nullable": true
},
"ToolType": {
"oneOf": [
{
@@ -2055,6 +2063,11 @@
"$ref": "#/components/schemas/FunctionName"
}
}
},
{
"type": "object",
"default": null,
"nullable": true
}
]
},
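For context on the schema change above, here is a hedged sketch of a chat request that exercises `tool_choice`. The endpoint, payload shape, and tool definition are assumptions based on TGI's OpenAI-compatible Messages API, not taken from this diff:

```shell
# Hypothetical request against a local TGI instance; the get_weather tool
# and the "auto" tool_choice value are illustrative assumptions.
curl http://localhost:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{
      "model": "tgi",
      "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
      "tools": [{
        "type": "function",
        "function": {
          "name": "get_weather",
          "description": "Look up the weather for a city",
          "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"]
          }
        }
      }],
      "tool_choice": "auto"
    }'
```
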
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -21,6 +21,8 @@
title: Messages API
- local: architecture
title: Internal Architecture
- local: usage_statistics
title: Usage Statistics
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
16 changes: 16 additions & 0 deletions docs/source/basic_tutorials/launcher.md
@@ -424,6 +424,22 @@ Options:
[env: LORA_ADAPTERS=]
```
## DISABLE_USAGE_STATS
```shell
--disable-usage-stats
Disable sending of all usage statistics
[env: DISABLE_USAGE_STATS=]
```
## DISABLE_CRASH_REPORTS
```shell
--disable-crash-reports
Disable sending of crash reports, but allow anonymous usage statistics
[env: DISABLE_CRASH_REPORTS=]
```
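A minimal usage sketch for the two new flags (the model ID is a placeholder):

```shell
# Disable all usage statistics:
text-generation-launcher --model-id $MODEL_ID --disable-usage-stats

# Keep anonymous usage statistics but suppress crash reports:
text-generation-launcher --model-id $MODEL_ID --disable-crash-reports
```
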
## HELP
1 change: 1 addition & 0 deletions docs/source/supported_models.md
@@ -5,6 +5,7 @@ Text Generation Inference enables serving optimized models on specific hardware

## Supported Models

- [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
73 changes: 73 additions & 0 deletions docs/source/usage_statistics.md
@@ -0,0 +1,73 @@

# Collection of Usage Statistics

Text Generation Inference collects anonymous usage statistics to help us improve TGI and understand what causes failures. The data is collected transparently, and any sensitive information is omitted.

Data is sent twice: once on server startup and once when the server stops. Usage statistics are only enabled when TGI is running in Docker, to avoid collecting data when TGI runs directly on the host machine.

## What data is collected

The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs).
As of release 2.1.2, this is an example of the data collected:

- From the TGI configuration:
```json
{
"event_type": "start",
"disable_grammar_support": false,
"max_batch_prefill_tokens": 4096,
"max_batch_size": null,
"max_batch_total_tokens": null,
"max_best_of": 2,
"max_client_batch_size": 4,
"max_concurrent_requests": 128,
"max_input_tokens": 1024,
"max_stop_sequences": 4,
"max_top_n_tokens": 5,
"max_total_tokens": 2048,
"max_waiting_tokens": 20,
"messages_api_enabled": false,
"model_config": {
"model_type": "Bloom"
},
"revision": null,
"tokenizer_class": "BloomTokenizerFast",
"validation_workers": 2,
"waiting_served_ratio": 1.2,
"docker_label": "latest",
"git_sha": "cfc118704880453d29bcbe4fbbd91dda501cf5fe",
"nvidia_env": {
"name": "NVIDIA A10G",
"pci_bus_id": "00000000:00:1E.0",
"driver_version": "535.183.01",
"pstate": "P8",
"pcie_link_gen_max": "4",
"pcie_link_gen_current": "1",
"temperature_gpu": "31",
"utilization_gpu": "0 %",
"utilization_memory": "0 %",
"memory_total": "23028 MiB",
"memory_free": "22515 MiB",
"memory_used": "0 MiB",
"reset_status_reset_required": "No",
"reset_status_drain_and_reset_recommended": "No",
"compute_cap": "8.6",
"ecc_errors_corrected_volatile_total": "0",
"mig_mode_current": "[N/A]",
"power_draw_instant": "10.86 W",
"power_limit": "300.00 W"
},
"system_env": {
"cpu_count": 16,
"cpu_type": "AMD EPYC 7R32",
"total_memory": 66681196544,
"architecture": "x86_64",
"platform": "linux-unix-x86_64"
}
}

```

## How to opt-out

You can opt out by passing the `--disable-usage-stats` flag to the `text-generation-launcher` command; this disables all usage statistics. You can also pass `--disable-crash-reports`, which disables sending crash reports but keeps anonymous usage statistics enabled.
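
Since statistics are only collected when TGI runs in Docker, a minimal opt-out sketch for the containerized case could look like this; the image tag, port mapping, and model ID are placeholders:

```shell
# Run the official container with usage statistics disabled;
# image tag, ports, and model ID below are placeholders.
docker run --gpus all -p 8080:80 \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id $MODEL_ID \
    --disable-usage-stats
```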