Simplify the attention function (#2609)
* Simplify the `attention` function

- Use one definition rather than multiple.
- Add `key`/`value` arguments, so that we don't need the
  `PREFILL_IN_KV_CACHE` constant.
- Make it kwargs-only (to avoid mixing up the various `Tensor` args); see the sketch below.

* Fixup flashinfer support
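
A hedged sketch of what the unified, kwargs-only signature could look like after this change. Only the `key`/`value` arguments, the kwargs-only calling convention, and the `KVCache`/`Seqlen` names (visible in the diff below) come from the commit; every other parameter name and default is an assumption for illustration.

# Illustrative sketch only -- not the exact signature from the commit.
import torch
from typing import Optional

def attention(
    *,  # kwargs-only: keeps the many Tensor arguments from being mixed up
    query: torch.Tensor,
    key: torch.Tensor,    # passed explicitly, so PREFILL_IN_KV_CACHE is no longer needed
    value: torch.Tensor,
    kv_cache: "KVCache",  # type re-exported by layers.attention (see diff below)
    seqlen: "Seqlen",     # type re-exported by layers.attention (see diff below)
    softmax_scale: float,             # assumed parameter
    window_size_left: int = -1,       # assumed parameter
    causal: bool = True,              # assumed parameter
    softcap: Optional[float] = None,  # assumed parameter
) -> torch.Tensor:
    ...
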
danieldk authored Oct 17, 2024
1 parent 5bbe1ce commit 59ea38c
Showing 21 changed files with 316 additions and 466 deletions.
4 changes: 0 additions & 4 deletions server/text_generation_server/layers/attention/__init__.py
@@ -8,23 +8,20 @@
     raise ImportError("`USE_FLASH_ATTENTION` is false.")
 if SYSTEM == "cuda":
     from .cuda import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
         reshape_and_cache,
     )
 elif SYSTEM == "rocm":
     from .rocm import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
         reshape_and_cache,
     )
 elif SYSTEM == "ipex":
     from .ipex import (
-        PREFILL_IN_KV_CACHE,
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
@@ -40,7 +37,6 @@
     "attention",
     "paged_attention",
     "reshape_and_cache",
-    "PREFILL_IN_KV_CACHE",
     "SUPPORTS_WINDOWING",
     "KVCache",
     "Seqlen",
(Diffs for the remaining 20 changed files are not shown here.)
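
Since `PREFILL_IN_KV_CACHE` is no longer re-exported, callers no longer branch on a backend-specific constant to decide which tensors to hand to attention; prefill and decode can go through the same call. A hedged usage sketch, reusing the assumed argument names from the signature sketch above:

# Illustrative only -- same call shape for prefill and decode.
attn_output = attention(
    query=query,
    key=key,          # freshly computed key/value are always passed in
    value=value,
    kv_cache=kv_cache,
    seqlen=seqlen,
    softmax_scale=softmax_scale,
)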
