From ec6d4592d59c136ea1d2cac3b4e2a7ea004a0ce3 Mon Sep 17 00:00:00 2001
From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Date: Mon, 11 Dec 2023 16:46:44 +0100
Subject: [PATCH] v1.3.1

---
 Cargo.lock                                           | 8 ++++----
 Cargo.toml                                           | 2 +-
 docs/openapi.json                                    | 2 +-
 integration-tests/pyproject.toml                     | 2 +-
 server/pyproject.toml                                | 2 +-
 .../models/custom_modeling/flash_mistral_modeling.py | 5 ++++-
 .../models/custom_modeling/flash_mixtral_modeling.py | 7 +++++--
 7 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 105aefe19e0..6ae066010ee 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2754,7 +2754,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "1.3.0"
+version = "1.3.1"
 dependencies = [
  "average",
  "clap",
@@ -2775,7 +2775,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "1.3.0"
+version = "1.3.1"
 dependencies = [
  "futures",
  "grpc-metadata",
@@ -2791,7 +2791,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "1.3.0"
+version = "1.3.1"
 dependencies = [
  "clap",
  "ctrlc",
@@ -2807,7 +2807,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "1.3.0"
+version = "1.3.1"
 dependencies = [
  "async-stream",
  "axum",
diff --git a/Cargo.toml b/Cargo.toml
index c45b11ebc4c..b22fccf89fb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,7 +8,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.3.0"
+version = "1.3.1"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
diff --git a/docs/openapi.json b/docs/openapi.json
index 153630c0219..6c372148bfe 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.3.0"
+    "version": "1.3.1"
   },
   "paths": {
     "/": {
diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml
index 9457efbcc5c..ae5876b0fba 100644
--- a/integration-tests/pyproject.toml
+++ b/integration-tests/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.3.0"
+version = "1.3.1"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry "]
 
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 8ca1a5c58d9..b2aa4dc3e97 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.3.0"
+version = "1.3.1"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene "]
 
diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
index 525bf6bc0a8..5a4f5be0631 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -391,6 +391,7 @@ def forward(
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        true_max_s: int,
         prefill_cache_indices: Optional[torch.Tensor],
     ) -> torch.Tensor:
         hidden_states = self.embed_tokens(input_ids)
@@ -398,7 +399,7 @@ def forward(
         # Get rotary cos and sin for this forward
         # Avoid to index in each layer
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
-            position_ids, max_s, hidden_states.dtype
+            position_ids, true_max_s, hidden_states.dtype
         )
 
         residual = None
@@ -449,6 +450,7 @@ def forward(
         prefill_cache_indices: Optional[torch.Tensor],
         lm_head_indices: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        true_max_s = max_s
         if prefill_cache_indices is not None:
             # Slots also need to be sliced as it has the same size as the whole kv tensor
             slots = slots[prefill_cache_indices]
@@ -467,6 +469,7 @@ def forward(
             slots,
             input_lengths,
             max_s,
+            true_max_s,
             prefill_cache_indices,
         )
         if lm_head_indices is not None:
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index 6f5edca2377..76ebc6b8f67 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -401,7 +401,7 @@ def topology(self, x: torch.Tensor, padded_bins: torch.Tensor):
             self.offsets_block_rows = block_rows
             offsets = self.offsets
         else:
-            offsets = self.offsets[:block_rows]
+            offsets = self.offsets[: block_rows + 1]
 
         # Indices for the sparse matrix. The indices for
         # the intermediate matrix are dynamic depending
@@ -632,6 +632,7 @@ def forward(
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        true_max_s: int,
         prefill_cache_indices: Optional[torch.Tensor],
     ) -> torch.Tensor:
         hidden_states = self.embed_tokens(input_ids)
@@ -639,7 +640,7 @@ def forward(
         # Get rotary cos and sin for this forward
         # Avoid to index in each layer
         cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
-            position_ids, max_s, hidden_states.dtype
+            position_ids, true_max_s, hidden_states.dtype
         )
 
         residual = None
@@ -690,6 +691,7 @@ def forward(
         prefill_cache_indices: Optional[torch.Tensor],
         lm_head_indices: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        true_max_s = max_s
         if prefill_cache_indices is not None:
             # Slots also need to be sliced as it has the same size as the whole kv tensor
             slots = slots[prefill_cache_indices]
@@ -708,6 +710,7 @@ def forward(
             slots,
             input_lengths,
             max_s,
+            true_max_s,
             prefill_cache_indices,
         )
         if lm_head_indices is not None:
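
Note on the true_max_s plumbing (flash_mistral_modeling.py and flash_mixtral_modeling.py):
The outer forward snapshots max_s as true_max_s before any further handling and
threads it down to the rotary embedding, so the cos/sin tables are sized to the
real maximum sequence length even if max_s is later adjusted for the sliding
window. The toy sketch below illustrates the failure mode this avoids; the
get_cos_sin body, dimensions, and the blanket assumption that max_s gets clamped
to the window are illustrative, not TGI's actual implementation.

    import torch

    def get_cos_sin(position_ids, max_s, dtype, dim=64, base=10000.0):
        # Toy rotary cache: precompute cos/sin for positions [0, max_s),
        # then gather the rows for this batch's position_ids.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        t = torch.arange(max_s, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)
        return freqs.cos().to(dtype)[position_ids], freqs.sin().to(dtype)[position_ids]

    sliding_window, true_max_s = 4096, 8192
    position_ids = torch.tensor([8190, 8191])  # positions beyond the window

    # A window-clamped max_s only covers [0, 4096), so the gather fails:
    # get_cos_sin(position_ids, min(sliding_window, true_max_s), torch.float16)  # IndexError

    # Sizing the cache with true_max_s covers every real position:
    cos, sin = get_cos_sin(position_ids, true_max_s, torch.float16)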
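
Note on the topology fix (flash_mixtral_modeling.py):
Changing offsets = self.offsets[:block_rows] to self.offsets[: block_rows + 1]
corrects a fencepost error: CSR-style row offsets for a sparse matrix with n
rows need n + 1 entries, because row i spans offsets[i]..offsets[i + 1], so the
old slice dropped the final row boundary. A minimal sketch under assumed
illustrative names (blocks_per_row is not from the patch):

    import torch

    blocks_per_row = torch.tensor([2, 1, 3])  # nonzero blocks per block-row
    block_rows = blocks_per_row.numel()

    # CSR-style offsets: prefix sums with a leading zero, n + 1 entries total.
    offsets = torch.zeros(block_rows + 1, dtype=torch.int64)
    offsets[1:] = torch.cumsum(blocks_per_row, dim=0)  # [0, 2, 3, 6]

    wrong = offsets[:block_rows]        # [0, 2, 3] -- last row loses its end boundary
    right = offsets[: block_rows + 1]   # [0, 2, 3, 6] -- what the patched slice keeps

    # Every row's extent is only recoverable from the n + 1 form:
    for i in range(block_rows):
        start, end = right[i].item(), right[i + 1].item()
        assert end - start == blocks_per_row[i].item()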