From 1d4a35a23cc45de215878584242402a23c6ce42e Mon Sep 17 00:00:00 2001
From: Vaibhav Srivastav
Date: Fri, 9 Aug 2024 15:01:34 +0200
Subject: [PATCH] Update documentation for Supported models (#2386)

* Minor doc fixes

* up.

* Other minor updates.
---
 docs/source/conceptual/quantization.md            | 4 ++--
 docs/source/quicktour.md                          | 2 +-
 docs/source/supported_models.md                   | 8 ++++----
 server/text_generation_server/models/__init__.py  | 6 +++---
 update_doc.py                                     | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md
index 7507687f0ac..a1ebe7e7da8 100644
--- a/docs/source/conceptual/quantization.md
+++ b/docs/source/conceptual/quantization.md
@@ -11,7 +11,7 @@ We recommend using the official quantization scripts for creating your quants:
 
 For on-the-fly quantization you simply need to pass one of the supported quantization types and TGI takes care of the rest.
 
-## Quantization with bitsandbytes
+## Quantization with bitsandbytes, EETQ & fp8
 
 bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
 
@@ -32,7 +32,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
 
 You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
 
-Use `eetq` or `fp8` for other quantization schemes.
+Similarly, you can pass `--quantize eetq` or `--quantize fp8` for the respective quantization schemes.
 
 In addition to this, TGI allows creating GPTQ quants directly by passing the model weights and a calibration dataset.
 
diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md
index 2313c69b70a..18e1a107903 100644
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@@ -21,7 +21,7 @@ TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPU
 
 ## Consuming TGI
 
-Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
+Once TGI is running, you can use the `generate` endpoint or the OpenAI Chat Completions API-compatible [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) by sending requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
 
diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
index b78104dfd9c..832f88ef7ea 100644
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@@ -1,22 +1,22 @@
 # Supported Models and Hardware
 
-Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported.
+Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models (VLMs & LLMs) are supported.
 
 ## Supported Models
 
 - [Deepseek V2](https://huggingface.co/deepseek-ai/DeepSeek-V2)
 - [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
 - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
-- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
+- [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
 - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
 - [Gemma](https://huggingface.co/google/gemma-7b)
 - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
-- [Gemma2](https://huggingface.co/google/gemma2-9b)
+- [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
 - [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
 - [Dbrx](https://huggingface.co/databricks/dbrx-instruct)
 - [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj)
-- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [Mistral](https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407)
 - [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
 - [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder)
 - [Phi](https://huggingface.co/microsoft/phi-1_5)
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index da14d083f66..960b426b38f 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -180,7 +180,7 @@ class ModelType(enum.Enum):
     LLAMA = {
         "type": "llama",
         "name": "Llama",
-        "url": "https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct",
+        "url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f",
     }
     PHI3 = {
         "type": "phi3",
@@ -200,7 +200,7 @@ class ModelType(enum.Enum):
     GEMMA2 = {
         "type": "gemma2",
         "name": "Gemma2",
-        "url": "https://huggingface.co/google/gemma2-9b",
+        "url": "https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315",
     }
     COHERE = {
         "type": "cohere",
@@ -220,7 +220,7 @@ class ModelType(enum.Enum):
     MISTRAL = {
         "type": "mistral",
         "name": "Mistral",
-        "url": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2",
+        "url": "https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407",
     }
     MIXTRAL = {
         "type": "mixtral",
diff --git a/update_doc.py b/update_doc.py
index 428d445211e..e887e1c6dc0 100644
--- a/update_doc.py
+++ b/update_doc.py
@@ -7,7 +7,7 @@
 TEMPLATE = """
 # Supported Models and Hardware
 
-Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported.
+Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models (VLMs & LLMs) are supported.
 
 ## Supported Models
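
A supplementary usage sketch for the quantization doc change, not part of the patch itself: the `--quantize eetq` and `--quantize fp8` values referenced above are launcher arguments that go after the image name in the `docker run` invocation shown in `quantization.md`. The model id, volume, and image tag below are illustrative placeholders:

```bash
# Illustrative launch with on-the-fly EETQ quantization (pass fp8 the same way).
model=HuggingFaceH4/zephyr-7b-beta  # placeholder model id
volume=$PWD/data                    # shared volume so weights are not re-downloaded

docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id $model --quantize eetq
```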
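Likewise, since the Messages API now linked from `quicktour.md` is compatible with the OpenAI Chat Completions API, a running server can be queried at `/v1/chat/completions` in addition to `/generate`. A minimal sketch, assuming a server listening locally on port 8080 as launched above; the `"model": "tgi"` value follows the convention used in the linked Messages API doc:

```bash
# Illustrative Messages API request against a local TGI server.
curl http://localhost:8080/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
      "model": "tgi",
      "messages": [{"role": "user", "content": "What is deep learning?"}],
      "stream": false,
      "max_tokens": 64
    }'
```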