Merge branch 'main' into multi-modal
sywangyi committed Jul 22, 2024
2 parents 47d0863 + f3435ba commit c590413
Showing 72 changed files with 4,915 additions and 599 deletions.
27 changes: 18 additions & 9 deletions .github/workflows/build.yaml
@@ -27,8 +27,8 @@ jobs:
concurrency:
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
- # TODO see with @Glegendre to get CPU runner here instead
- runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
+ runs-on:
+   group: aws-r7i-8xlarge-priv
permissions:
contents: write
packages: write
@@ -49,7 +49,7 @@ jobs:
export dockerfile="Dockerfile"
export label_extension=""
export docker_devices=""
export runs_on="nvidia-gpu"
export runs_on="aws-g5-12xlarge"
;;
rocm)
export dockerfile="Dockerfile_amd"
@@ -79,9 +79,15 @@ jobs:
uses: docker/setup-buildx-action@v3
with:
install: true
- config-inline: |
+ buildkitd-config-inline: |
[registry."docker.io"]
- mirrors = ["registry.github-runners.huggingface.tech"]
+ mirrors = ["registry-us-east-1-mirror.prod.aws.ci.huggingface.tech"]
+ - name: Login to internal Container Registry
+   uses: docker/login-action@v3
+   with:
+     username: ${{ secrets.REGISTRY_USERNAME }}
+     password: ${{ secrets.REGISTRY_PASSWORD }}
+     registry: registry.internal.huggingface.tech
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
@@ -103,7 +109,8 @@ jobs:
uses: docker/metadata-action@v5
with:
images: |
- registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
+ registry-us-east-1.prod.aws.ci.huggingface.tech/api-inference/community/text-generation-inference
+ registry.internal.huggingface.tech/api-inference/community/text-generation-inference
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
# If main, release or tag
@@ -115,7 +122,8 @@ jobs:
flavor: |
latest=auto
images: |
- registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
+ registry-us-east-1.prod.aws.ci.huggingface.tech/api-inference/community/text-generation-inference
+ registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
tags: |
@@ -141,7 +149,7 @@ jobs:
- name: Final
id: final
run: |
echo "docker_image=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
echo "docker_image=registry-us-east-1.prod.aws.ci.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
@@ -150,7 +158,8 @@ jobs:
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: build-and-push
runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
runs-on:
group: ${{ needs.build-and-push.outputs.runs_on }}
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
env:
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }}
3 changes: 2 additions & 1 deletion .github/workflows/load_test.yaml
@@ -15,7 +15,8 @@ jobs:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
- runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
+ runs-on:
+   group: aws-g5-12xlarge
env:
DOCKER_VOLUME: /cache
steps:
53 changes: 47 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

17 changes: 14 additions & 3 deletions Dockerfile
@@ -161,6 +161,17 @@ COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build

# Build FBGEMM CUDA kernels
FROM kernel-builder AS fbgemm-builder

WORKDIR /usr/src

COPY server/Makefile-fbgemm Makefile
COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch
COPY server/fix_torch90a.sh fix_torch90a.sh

RUN make build-fbgemm

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

@@ -225,10 +236,10 @@ COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-31
# Copy build artifacts from marlin kernels builder
COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

- # Copy builds artifacts from vllm builder
+ # Copy build artifacts from fbgemm builder
+ COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
+ # Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
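As an aside, the new `fbgemm-builder` stage can be built on its own with Docker's `--target` flag. This is a hedged debugging sketch, not part of the commit; the image tag is a placeholder:

```shell
# Build only the fbgemm-builder stage (stage name from the Dockerfile above)
# to debug the FBGEMM kernel build in isolation; run from the repo root.
docker build --target fbgemm-builder -t tgi-fbgemm-debug .
```
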
15 changes: 14 additions & 1 deletion docs/openapi.json
@@ -909,7 +909,7 @@
"tool_choice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
"$ref": "#/components/schemas/ToolChoice"
}
],
"nullable": true
@@ -2035,6 +2035,14 @@
}
}
},
"ToolChoice": {
"allOf": [
{
"$ref": "#/components/schemas/ToolType"
}
],
"nullable": true
},
"ToolType": {
"oneOf": [
{
@@ -2055,6 +2063,11 @@
"$ref": "#/components/schemas/FunctionName"
}
}
},
{
"type": "object",
"default": null,
"nullable": true
}
]
},
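For context on the schema change above, here is a hedged sketch of a chat request that exercises `tool_choice`. The endpoint, payload shape, and tool definition are assumptions based on TGI's OpenAI-compatible Messages API, not taken from this diff:

```shell
# Hypothetical request against a local TGI instance; the get_weather tool
# and the "auto" tool_choice value are illustrative assumptions.
curl http://localhost:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{
      "model": "tgi",
      "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
      "tools": [{
        "type": "function",
        "function": {
          "name": "get_weather",
          "description": "Look up the weather for a city",
          "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"]
          }
        }
      }],
      "tool_choice": "auto"
    }'
```
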
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -21,6 +21,8 @@
title: Messages API
- local: architecture
title: Internal Architecture
- local: usage_statistics
title: Usage Statistics
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
16 changes: 16 additions & 0 deletions docs/source/basic_tutorials/launcher.md
@@ -424,6 +424,22 @@ Options:
[env: LORA_ADAPTERS=]
```
## DISABLE_USAGE_STATS
```shell
--disable-usage-stats
Disable sending of all usage statistics
[env: DISABLE_USAGE_STATS=]
```
## DISABLE_CRASH_REPORTS
```shell
--disable-crash-reports
Disable sending of crash reports, but allow anonymous usage statistics
[env: DISABLE_CRASH_REPORTS=]
```
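A minimal usage sketch for the two new flags (the model ID is a placeholder):

```shell
# Disable all usage statistics:
text-generation-launcher --model-id $MODEL_ID --disable-usage-stats

# Keep anonymous usage statistics but suppress crash reports:
text-generation-launcher --model-id $MODEL_ID --disable-crash-reports
```
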
## HELP
1 change: 1 addition & 0 deletions docs/source/supported_models.md
@@ -5,6 +5,7 @@ Text Generation Inference enables serving optimized models on specific hardware

## Supported Models

- [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
73 changes: 73 additions & 0 deletions docs/source/usage_statistics.md
@@ -0,0 +1,73 @@

# Collection of Usage Statistics

Text Generation Inference collects anonymous usage statistics to help us improve TGI and understand what causes failures. The data is collected transparently, and any sensitive information is omitted.

Data is sent twice: once on server startup and once when the server stops. Usage statistics are only enabled when TGI is running in Docker, to avoid collecting data when TGI runs directly on the host machine.

## What data is collected

The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs).
As of release 2.1.2, this is an example of the data collected:

- From the TGI configuration:
```json
{
"event_type": "start",
"disable_grammar_support": false,
"max_batch_prefill_tokens": 4096,
"max_batch_size": null,
"max_batch_total_tokens": null,
"max_best_of": 2,
"max_client_batch_size": 4,
"max_concurrent_requests": 128,
"max_input_tokens": 1024,
"max_stop_sequences": 4,
"max_top_n_tokens": 5,
"max_total_tokens": 2048,
"max_waiting_tokens": 20,
"messages_api_enabled": false,
"model_config": {
"model_type": "Bloom"
},
"revision": null,
"tokenizer_class": "BloomTokenizerFast",
"validation_workers": 2,
"waiting_served_ratio": 1.2,
"docker_label": "latest",
"git_sha": "cfc118704880453d29bcbe4fbbd91dda501cf5fe",
"nvidia_env": {
"name": "NVIDIA A10G",
"pci_bus_id": "00000000:00:1E.0",
"driver_version": "535.183.01",
"pstate": "P8",
"pcie_link_gen_max": "4",
"pcie_link_gen_current": "1",
"temperature_gpu": "31",
"utilization_gpu": "0 %",
"utilization_memory": "0 %",
"memory_total": "23028 MiB",
"memory_free": "22515 MiB",
"memory_used": "0 MiB",
"reset_status_reset_required": "No",
"reset_status_drain_and_reset_recommended": "No",
"compute_cap": "8.6",
"ecc_errors_corrected_volatile_total": "0",
"mig_mode_current": "[N/A]",
"power_draw_instant": "10.86 W",
"power_limit": "300.00 W"
},
"system_env": {
"cpu_count": 16,
"cpu_type": "AMD EPYC 7R32",
"total_memory": 66681196544,
"architecture": "x86_64",
"platform": "linux-unix-x86_64"
}
}

```

## How to opt-out

You can opt out by passing the `--disable-usage-stats` flag to the `text-generation-launcher` command; this disables all usage statistics. You can also pass `--disable-crash-reports`, which disables sending crash reports but keeps anonymous usage statistics enabled.
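
Since statistics are only collected when TGI runs in Docker, a minimal opt-out sketch for the containerized case could look like this; the image tag, port mapping, and model ID are placeholders:

```shell
# Run the official container with usage statistics disabled;
# image tag, ports, and model ID below are placeholders.
docker run --gpus all -p 8080:80 \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id $MODEL_ID \
    --disable-usage-stats
```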