From a7834ceb0feef6132ed83b27167da0f5e9633a0a Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 6 Sep 2024 10:22:00 +0100
Subject: [PATCH] Docs: add more cross-references to the KV cache docs (#33323)

* add more cross-references

* nit

* import guard

* more import guards

* nit

* Update src/transformers/generation/configuration_utils.py
---
 docs/source/en/kv_cache.md | 20 +++++-----
 docs/source/en/llm_optims.md | 2 +-
 docs/source/en/llm_tutorial_optimization.md | 2 +-
 .../generation/configuration_utils.py | 37 +++++++++++++++++--
 src/transformers/generation/utils.py | 23 +++---------
 .../models/bloom/modeling_bloom.py | 3 +-
 .../models/codegen/modeling_codegen.py | 3 +-
 .../models/cohere/modeling_cohere.py | 3 +-
 src/transformers/models/dbrx/modeling_dbrx.py | 3 +-
 .../models/falcon/modeling_falcon.py | 3 +-
 .../models/gemma/modeling_gemma.py | 3 +-
 .../models/gemma2/modeling_gemma2.py | 3 +-
 src/transformers/models/git/modeling_git.py | 3 +-
 .../models/gpt_neo/modeling_gpt_neo.py | 3 +-
 .../models/gpt_neox/modeling_gpt_neox.py | 3 +-
 src/transformers/models/gptj/modeling_gptj.py | 3 +-
 .../models/granite/modeling_granite.py | 3 +-
 .../models/llama/modeling_llama.py | 3 +-
 .../models/mistral/modeling_mistral.py | 3 +-
 .../models/mistral/modeling_tf_mistral.py | 3 +-
 .../models/nemotron/modeling_nemotron.py | 3 +-
 src/transformers/models/olmo/modeling_olmo.py | 3 +-
 .../models/persimmon/modeling_persimmon.py | 3 +-
 src/transformers/models/phi/modeling_phi.py | 3 +-
 src/transformers/models/phi3/modeling_phi3.py | 3 +-
 .../models/qwen2/modeling_qwen2.py | 3 +-
 .../models/qwen2_moe/modeling_qwen2_moe.py | 3 +-
 .../models/stablelm/modeling_stablelm.py | 3 +-
 .../models/starcoder2/modeling_starcoder2.py | 3 +-
 29 files changed, 99 insertions(+), 57 deletions(-)

diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md
index be566437a34704..1a9ea1ac001907 100644
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@@ -51,11 +51,11 @@ More concretely, key-value cache acts as a memory bank for these generative mode
 
 See an example below for how to implement your own generation loop.
 
-  
+
 ```python
 >>> import torch
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
-  
+
 >>> model_id = "meta-llama/Llama-2-7b-chat-hf"
 >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
 >>> tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -69,10 +69,10 @@ More concretely, key-value cache acts as a memory bank for these generative mode
 >>> max_new_tokens = 10
 
 >>> for _ in range(max_new_tokens):
-...     outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True)  
+...     outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True)
 ...     # Greedily sample one next token
 ...     next_token_ids = outputs.logits[:, -1:].argmax(-1)
-...     generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)  
+...     generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
 ...
 ...     # Prepare inputs for the next generation step by leaving unprocessed tokens, in our case we have only one new token
 ...     # and expanding attn mask for the new token, as explained above
@@ -222,7 +222,7 @@ before successfully generating 40 beams.
 
 ### Static Cache
 
-Since the "DynamicCache" dynamically grows with each generation step, it prevents you from taking advantage of JIT optimizations. The [`~StaticCache`] pre-allocates
+Since the "DynamicCache" dynamically grows with each generation step, it prevents you from taking advantage of JIT optimizations. The [`~StaticCache`] pre-allocates
 a specific maximum size for the keys and values, allowing you to generate up to the maximum length without having to modify cache size. Check the below usage example.
 
 For more examples with Static Cache and JIT compilation, take a look at [StaticCache & torchcompile](./llm_optims#static-kv-cache-and-torchcompile)
@@ -267,7 +267,7 @@ This will use the [`~OffloadedStaticCache`] implementation instead.
 As the name suggests, this cache type implements a sliding window over previous keys and values, retaining only the last `sliding_window` tokens. It should be used with models like Mistral that support sliding window attention. Additionally, similar to Static Cache, this one is JIT-friendly and can be used with the same compile techniques as Static Cache.
 
-Note that you can use this cache only for models that support sliding window, e.g. Mistral models.  
+Note that you can use this cache only for models that support sliding window, e.g. Mistral models.
 
 
 ```python
@@ -324,7 +324,7 @@ We have seen how to use each of the cache types when generating. What if you wan
 
 The general format when doing iterative generation is as below. First you have to initialize an empty cache of the type you want, and you can start feeding in new prompts iteratively. Keeping track of dialogue history and formatting can be done with chat templates, read more on that in [chat_templating](./chat_templating)
 
-In case you are using Sink Cache, you have to crop your inputs to that maximum length because Sink Cache can generate text longer than its maximum window size, but it expects the first input to not exceed the maximum cache length.  
+In case you are using Sink Cache, you have to crop your inputs to that maximum length because Sink Cache can generate text longer than its maximum window size, but it expects the first input to not exceed the maximum cache length.
 
 
 ```python
@@ -354,9 +354,9 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
 ...     inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
 ...     if isinstance(past_key_values, SinkCache):
 ...         inputs = {k: v[:, -max_cache_length:] for k, v in inputs.items()}
-...  
+...
 ...     input_length = inputs["input_ids"].shape[1]
-...  
+...
 ...     outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256, past_key_values=past_key_values)
 ...     completion = tokenizer.decode(outputs[0, input_length: ], skip_special_tokens=True)
 ...     messages.append({"role": "assistant", "content": completion})
@@ -400,4 +400,4 @@ Sometimes you would want to first fill-in cache object with key/values for certain
 >>> print(responses)
 [' You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTitle: The Ultimate Guide to Travelling: Tips, Tricks, and', ' You are a helpful assistant. What is the capital of France?\n\nYes, the capital of France is Paris.']
-```
\ No newline at end of file
+```

diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md
index 881cd6cd754e2a..16be638498dfd4 100644
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@@ -24,7 +24,7 @@ This guide will show you how to use the optimization techniques available in Tra
 
 During decoding, an LLM computes the key-value (kv) values for each input token and since it is autoregressive, it computes the same kv values each time because the generated output becomes part of the input now. This is not very efficient because you're recomputing the same kv values each time.
 
-To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.
+To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels. We have an entire guide dedicated to kv-caches [here](./kv_cache).
 
 The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with `torch.compile` for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware.
 
diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md
index 23086929f6d54a..a675a6de39a2fc 100644
--- a/docs/source/en/llm_tutorial_optimization.md
+++ b/docs/source/en/llm_tutorial_optimization.md
@@ -662,7 +662,7 @@ Using the key-value cache has two advantages:
 - Significant increase in computational efficiency as fewer computations are performed compared to computing the full \\( \mathbf{QK}^T \\) matrix. This leads to an increase in inference speed
 - The maximum required memory is not increased quadratically with the number of generated tokens, but only increases linearly.
 
-> One should *always* make use of the key-value cache as it leads to identical results and a significant speed-up for longer input sequences. Transformers has the key-value cache enabled by default when making use of the text pipeline or the [`generate` method](https://huggingface.co/docs/transformers/main_classes/text_generation).
+> One should *always* make use of the key-value cache as it leads to identical results and a significant speed-up for longer input sequences. Transformers has the key-value cache enabled by default when making use of the text pipeline or the [`generate` method](https://huggingface.co/docs/transformers/main_classes/text_generation). We have an entire guide dedicated to caches [here](./kv_cache).
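
As a usage sketch of the static kv-cache that the two guides above point to (not part of the patch; it assumes a CUDA device and access to the `meta-llama/Llama-2-7b-chat-hf` checkpoint):

```python
# Minimal sketch: select a pre-allocated cache by name so the decoding forward
# pass can be compiled with `torch.compile`, mirroring the pattern in llm_optims.md.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"  # assumption: any causal LM checkpoint works
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")

# "static" picks StaticCache; `generate` instantiates it internally
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

inputs = tokenizer("The theory of relativity states", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```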
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index df4d028bede541..773ef0ccfe55eb 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -43,11 +43,34 @@
 logger = logging.get_logger(__name__)
 METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash", "transformers_version")
 NEEDS_CACHE_CONFIG = {}
+NEED_SETUP_CACHE_CLASSES_MAPPING = {}
+QUANT_BACKEND_CLASSES_MAPPING = {}
+ALL_CACHE_IMPLEMENTATIONS = []
 
 if is_torch_available():
-    from ..cache_utils import QuantizedCacheConfig
+    from ..cache_utils import (
+        HQQQuantizedCache,
+        HybridCache,
+        MambaCache,
+        OffloadedStaticCache,
+        QuantizedCacheConfig,
+        QuantoQuantizedCache,
+        SlidingWindowCache,
+        StaticCache,
+    )
 
     NEEDS_CACHE_CONFIG["quantized"] = QuantizedCacheConfig
+    NEED_SETUP_CACHE_CLASSES_MAPPING = {
+        "static": StaticCache,
+        "offloaded_static": OffloadedStaticCache,
+        "sliding_window": SlidingWindowCache,
+        "hybrid": HybridCache,
+        "mamba": MambaCache,
+    }
+    QUANT_BACKEND_CLASSES_MAPPING = {"quanto": QuantoQuantizedCache, "HQQ": HQQQuantizedCache}
+    ALL_CACHE_IMPLEMENTATIONS = list(NEED_SETUP_CACHE_CLASSES_MAPPING.keys()) + list(
+        QUANT_BACKEND_CLASSES_MAPPING.keys()
+    )
 
 
 class GenerationMode(ExplicitEnum):
@@ -70,7 +93,7 @@ class GenerationMode(ExplicitEnum):
 
 class GenerationConfig(PushToHubMixin):
     # no-format
-    r"""
+    rf"""
     Class that holds a configuration for a generation task. A `generate` call supports the following generation methods
     for text-decoder, text-to-text, speech-to-text, and vision-to-text models:
 
@@ -146,7 +169,10 @@ class GenerationConfig(PushToHubMixin):
             Whether or not the model should use the past last key/values attentions (if applicable to the model) to
             speed up decoding.
         cache_implementation (`str`, *optional*, default to `None`):
-            Cache class that should be used when generating.
+            Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are:
+            {ALL_CACHE_IMPLEMENTATIONS}. We support other cache types, but they must be manually instantiated and
+            passed to `generate` through the `past_key_values` argument. See our
+            [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
         cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
             Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
             it will be converted to its respective `CacheConfig` internally.
@@ -699,6 +725,11 @@ def validate(self, is_init=False):
             )
 
         # 5. check cache-related arguments
+        if self.cache_implementation is not None and self.cache_implementation not in ALL_CACHE_IMPLEMENTATIONS:
+            raise ValueError(
+                f"Invalid `cache_implementation` ({self.cache_implementation}). Choose one of: "
+                f"{ALL_CACHE_IMPLEMENTATIONS}"
+            )
         if self.cache_config is not None:
             cache_class = NEEDS_CACHE_CONFIG.get(self.cache_implementation)
             if cache_class is None:
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index f202e2fb2aab81..9981824b9611fd 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -29,15 +29,8 @@
     Cache,
     DynamicCache,
     EncoderDecoderCache,
-    HQQQuantizedCache,
-    HybridCache,
-    MambaCache,
     OffloadedCache,
-    OffloadedStaticCache,
     QuantizedCacheConfig,
-    QuantoQuantizedCache,
-    SlidingWindowCache,
-    StaticCache,
 )
 from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
@@ -68,7 +61,12 @@
     _prepare_attention_mask,
     _prepare_token_type_ids,
 )
-from .configuration_utils import GenerationConfig, GenerationMode
+from .configuration_utils import (
+    NEED_SETUP_CACHE_CLASSES_MAPPING,
+    QUANT_BACKEND_CLASSES_MAPPING,
+    GenerationConfig,
+    GenerationMode,
+)
 from .logits_process import (
     EncoderNoRepeatNGramLogitsProcessor,
     EncoderRepetitionPenaltyLogitsProcessor,
@@ -118,15 +116,6 @@
 if is_accelerate_available():
     from accelerate.hooks import AlignDevicesHook, add_hook_to_module
 
-NEED_SETUP_CACHE_CLASSES_MAPPING = {
-    "static": StaticCache,
-    "offloaded_static": OffloadedStaticCache,
-    "sliding_window": SlidingWindowCache,
-    "hybrid": HybridCache,
-    "mamba": MambaCache,
-}
-QUANT_BACKEND_CLASSES_MAPPING = {"quanto": QuantoQuantizedCache, "HQQ": HQQQuantizedCache}
-
 
 @dataclass
 class GenerateDecoderOnlyOutput(ModelOutput):
diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py
index d3ca8eecbad482..0a353eda9b4826 100644
--- a/src/transformers/models/bloom/modeling_bloom.py
+++ b/src/transformers/models/bloom/modeling_bloom.py
@@ -551,7 +551,8 @@ def _init_weights(self, module: nn.Module):
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
             Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
+            - a [`~cache_utils.Cache`] instance, see our
+              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
               cache format.
diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py
index bfa591f7bdafbd..46eea43e1285f8 100644
--- a/src/transformers/models/codegen/modeling_codegen.py
+++ b/src/transformers/models/codegen/modeling_codegen.py
@@ -429,7 +429,8 @@ def _init_weights(self, module):
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
             Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
+            - a [`~cache_utils.Cache`] instance, see our
+              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
              cache format.
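
The new `validate()` check in `configuration_utils.py` above can be exercised directly; a minimal sketch (the error wording follows the `ValueError` added in this patch, and `"not_a_cache"` is a hypothetical invalid name):

```python
# Sketch: `validate()` runs when the config is constructed, so an unknown
# `cache_implementation` now fails fast instead of being silently ignored.
from transformers import GenerationConfig

ok = GenerationConfig(cache_implementation="static")  # listed in ALL_CACHE_IMPLEMENTATIONS

try:
    GenerationConfig(cache_implementation="not_a_cache")
except ValueError as err:
    print(err)  # Invalid `cache_implementation` (not_a_cache). Choose one of: [...]
```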
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 79aac224eda7ed..4703aa4af3efd6 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -721,7 +721,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 1764dd7c5a67dd..d6eea9a12040ce 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -948,7 +948,8 @@ def _init_weights(self, module: nn.Module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index a9acd171c3aefe..a340689a7c3f60 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -836,7 +836,8 @@ def forward( returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 148eee115b6f7d..9b6a4522dddc91 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -726,7 +726,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. 
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 26b3c3a4e6edb3..0b3a7ed305eb86 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -716,7 +716,8 @@ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 4807289c927cbf..43333e3d3338d1 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -578,7 +578,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index e59853677f83cc..72590862b749f0 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -610,7 +610,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 7be35c0d137d28..259f01fd3cb131 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -820,7 +820,8 @@ def forward( returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. 
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 1408bfe8a61d06..bd7ce5696fa077 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -621,7 +621,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index aec3bfa7ddfd0a..ff10b6e6d875f9 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -720,7 +720,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 8cd288a553fd9d..39546470434080 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -841,7 +841,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 5ad8d2f787da37..5c13819469b001 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -661,7 +661,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. 
diff --git a/src/transformers/models/mistral/modeling_tf_mistral.py b/src/transformers/models/mistral/modeling_tf_mistral.py
index 40db52d99b8c33..5c21dd3c3f5334 100644
--- a/src/transformers/models/mistral/modeling_tf_mistral.py
+++ b/src/transformers/models/mistral/modeling_tf_mistral.py
@@ -728,8 +728,7 @@ class TFMistralPreTrainedModel(TFPreTrainedModel):
             blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
-            Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
+            One format is allowed:
             - Tuple of `tuple(tf.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of
               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
               cache format.
diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py
index 719f3ff2fd172d..b79ff4e004fc83 100644
--- a/src/transformers/models/nemotron/modeling_nemotron.py
+++ b/src/transformers/models/nemotron/modeling_nemotron.py
@@ -735,7 +735,8 @@ def _init_weights(self, module):
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
             Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
+            - a [`~cache_utils.Cache`] instance, see our
+              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
               cache format.
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index dae04b1a4251fa..fb5607263d9548 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -762,7 +762,8 @@ def _init_weights(self, module):
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
             Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
+            - a [`~cache_utils.Cache`] instance, see our
+              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
               cache format.
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index eba9587ba350f5..3f086b989b2e97 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -576,7 +576,8 @@ def _init_weights(self, module):
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
             Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance;
+            - a [`~cache_utils.Cache`] instance, see our
+              [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
              cache format.
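
For reference, the two `past_key_values` formats named in these docstrings can be converted into one another; a small sketch with hypothetical shapes, using the legacy tuple-of-tuples layout described above and the PyTorch `DynamicCache` helpers:

```python
# Sketch: build a legacy-format cache and wrap it in a `Cache` instance.
import torch
from transformers import DynamicCache

n_layers, batch_size, num_heads, seq_len, head_dim = 2, 1, 4, 3, 8  # hypothetical sizes

# Legacy format: one (key, value) pair per layer, each tensor of shape
# (batch_size, num_heads, sequence_length, embed_size_per_head)
legacy_cache = tuple(
    (
        torch.zeros(batch_size, num_heads, seq_len, head_dim),
        torch.zeros(batch_size, num_heads, seq_len, head_dim),
    )
    for _ in range(n_layers)
)

cache = DynamicCache.from_legacy_cache(legacy_cache)  # `Cache` instance, the preferred format
print(cache.get_seq_length())        # 3
print(len(cache.to_legacy_cache()))  # 2, one (key, value) pair per layer
```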
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 2850e043736c39..884566d6b2d08d 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -857,7 +857,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 6d1494b8eff573..1f066141433f1b 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -900,7 +900,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index c973e5bae7c5a1..f3dbe80d6e5169 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -765,7 +765,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index b5ac7058db55c9..be55cb9ebf09d8 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -920,7 +920,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. 
diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index ac00adb9c731d2..d20bc57cb2ff60 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -852,7 +852,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 9e09148cc65861..547463ec3d7cf2 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -738,7 +738,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format.
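
Finally, the updated `cache_implementation` docstring notes that cache types without a string name must be instantiated manually and passed through `past_key_values`; a sketch (assuming a CUDA device and the `meta-llama/Llama-2-7b-chat-hf` checkpoint):

```python
# Sketch: SinkCache has no `cache_implementation` name, so it is created by hand
# and handed to `generate` via the `past_key_values` argument.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, SinkCache

model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
outputs = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```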