
Commit 27daf69

Merge branch 'huggingface:main' into vb/followup-doc-fixes

2 parents: 03bfff5 + c6d5039
File tree: 7 files changed (+518 / -44 lines)


flake.lock

99 additions, 0 deletions (generated file; diff not rendered by default).

flake.nix

73 additions, 0 deletions (new file):

```nix
{
  inputs = {
    tgi-nix.url = "github:danieldk/tgi-nix";
    nixpkgs.follows = "tgi-nix/nixpkgs";
    flake-utils.url = "github:numtide/flake-utils";
  };
  outputs =
    {
      self,
      nixpkgs,
      flake-utils,
      tgi-nix,
    }:
    flake-utils.lib.eachDefaultSystem (
      system:
      let
        config = {
          allowUnfree = true;
          cudaSupport = true;
        };
        pkgs = import nixpkgs {
          inherit config system;
          overlays = [ tgi-nix.overlay ];
        };
      in
      {
        devShells.default =
          with pkgs;
          mkShell {
            buildInputs =
              [
                cargo
                openssl.dev
                pkg-config
              ]
              ++ (with python3.pkgs; [
                venvShellHook
                pip

                einops
                fbgemm-gpu
                flash-attn
                flash-attn-layer-norm
                flash-attn-rotary
                grpc-interceptor
                grpcio-reflection
                grpcio-status
                hf-transfer
                loguru
                marlin-kernels
                opentelemetry-api
                opentelemetry-exporter-otlp
                opentelemetry-instrumentation-grpc
                opentelemetry-semantic-conventions
                peft
                tokenizers
                torch
                transformers
                vllm
              ]);

            venvDir = "./.venv";

            postVenv = ''
              unset SOURCE_DATE_EPOCH
            '';
            postShellHook = ''
              unset SOURCE_DATE_EPOCH
            '';
          };
      }
    );
}
```
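For context: with a flakes-enabled Nix, a dev shell like this is normally entered with `nix develop`, after which `venvShellHook` creates the `./.venv` virtualenv; unsetting `SOURCE_DATE_EPOCH` in the hooks is presumably there to keep pip from stamping packages with Nix's fixed build timestamp inside that venv.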

server/text_generation_server/layers/attention/common.py

2 additions, 2 deletions:

```diff
@@ -1,10 +1,10 @@
 from dataclasses import dataclass
-from text_generation_server.models.globals import FLASH_DECODING
+from text_generation_server.models.globals import FLASH_DECODING, FLASH_INFER
 import torch
 from typing import Optional
 
 
-if FLASH_DECODING:
+if FLASH_DECODING or FLASH_INFER:
 
     @dataclass
     class Seqlen:
```
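FLASH_DECODING and FLASH_INFER come from text_generation_server.models.globals, which is not part of this diff; in TGI such switches are typically driven by environment variables. The sketch below shows that general pattern under that assumption; the helper name `_env_flag` and the accepted truthy values are illustrative, not the actual globals.py.

```python
# Hedged sketch: how module-level feature flags such as FLASH_INFER are often
# derived from environment variables. The real models/globals.py may differ.
import os


def _env_flag(name: str) -> bool:
    # Hypothetical helper: treat "1"/"true"/"yes" (any case) as enabled.
    return os.getenv(name, "").strip().lower() in {"1", "true", "yes"}


FLASH_DECODING = _env_flag("FLASH_DECODING")
FLASH_INFER = _env_flag("FLASH_INFER")

if __name__ == "__main__":
    # Either flag switches common.py to the flattened Seqlen/KV-cache layout.
    print("flattened layout:", FLASH_DECODING or FLASH_INFER)
```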

server/text_generation_server/layers/attention/cuda.py

42 additions, 4 deletions:

```diff
@@ -1,6 +1,10 @@
 import torch
 from text_generation_server.utils.import_utils import SYSTEM
-from text_generation_server.models.globals import FLASH_DECODING, BLOCK_SIZE
+from text_generation_server.models.globals import (
+    FLASH_DECODING,
+    BLOCK_SIZE,
+    FLASH_INFER,
+)
 from text_generation_server.layers.attention import Seqlen
 from typing import Optional
@@ -23,7 +27,7 @@ def reshape_and_cache(
     value_cache: torch.Tensor,
     slots: torch.Tensor,
 ):
-    if FLASH_DECODING:
+    if FLASH_DECODING or FLASH_INFER:
         shape = key_cache.shape
         key_cache.view(-1, shape[-2], shape[-1])[slots] = key
         value_cache.view(-1, shape[-2], shape[-1])[slots] = value
@@ -72,7 +76,16 @@ def paged_attention(
     # V1 to avoid the overhead of reduction. Also, if the number of
     # sequences or heads is large, we use V1 since there is enough work
     # to parallelize.
-    if FLASH_DECODING:
+    if FLASH_INFER:
+        from text_generation_server.layers.attention.flash_infer import decode_state
+
+        return decode_state.get().forward(
+            query.contiguous(),
+            paged_kv_cache=(key_cache, value_cache),
+            logits_soft_cap=softcap,
+            sm_scale=softmax_scale,
+        )
+    elif FLASH_DECODING:
         max_q = 1
         max_k = max_s
         import flash_attn_2_cuda
@@ -206,7 +219,32 @@ def paged_attention(
 
 SUPPORTS_WINDOWING = V2
 
-if V2:
+if FLASH_INFER:
+
+    def attention(
+        q,
+        k,
+        v,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+        causal=True,
+        softcap=0.0,
+    ):
+        from text_generation_server.layers.attention.flash_infer import prefill_state
+
+        return prefill_state.get().forward(
+            q,
+            k,
+            v,
+            causal=causal,
+            window_left=window_size_left,
+            logits_soft_cap=softcap,
+            sm_scale=softmax_scale,
+        )
+
+elif V2:
 
     def attention(
         q,
```
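In both new branches FLASH_INFER takes precedence (over FLASH_DECODING for decode, over V2 for prefill), and the backend is fetched with `decode_state.get()` / `prefill_state.get()` rather than passed as an argument, which reads like per-batch state stored in context variables by the flash_infer module (not shown in this commit). Below is a minimal, self-contained sketch of that dispatch pattern; the class and function names are invented for illustration and the stub backend does no real attention math.

```python
# Illustrative sketch of the "fetch the current backend from a ContextVar"
# pattern implied by decode_state.get(); names are hypothetical and the
# backend is a stub, not the real flashinfer wrapper.
from contextlib import contextmanager
from contextvars import ContextVar


class StubDecodeBackend:
    """Stand-in for a decode-attention wrapper planned for one batch."""

    def forward(self, query, paged_kv_cache, logits_soft_cap, sm_scale):
        # A real backend would run paged decode attention against the KV cache.
        return f"decode(q={query}, softcap={logits_soft_cap}, scale={sm_scale})"


decode_state = ContextVar("decode_state")


@contextmanager
def use_decode_state(backend):
    # Install the backend for the duration of one forward pass, then restore.
    token = decode_state.set(backend)
    try:
        yield backend
    finally:
        decode_state.reset(token)


def paged_attention_stub(query):
    # Mirrors the diff: the kernel entry point pulls whatever backend the
    # caller installed, so no extra arguments thread through the call sites.
    return decode_state.get().forward(
        query,
        paged_kv_cache=None,
        logits_soft_cap=0.0,
        sm_scale=1.0,
    )


if __name__ == "__main__":
    with use_decode_state(StubDecodeBackend()):
        print(paged_attention_stub("q[1, 8, 64]"))
```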
