diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md
index b1abd1ee19b..68e487d0a73 100644
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@@ -93,10 +93,10 @@ Options:
 ## KV_CACHE_DTYPE
 ```shell
       --kv-cache-dtype <KV_CACHE_DTYPE>
-          Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). Currently the only supported value is `fp8_e5m2` on CUDA
+          Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). Currently the only supported values are `fp8_e4m3fn` and `fp8_e5m2` on CUDA
           
           [env: KV_CACHE_DTYPE=]
-          [possible values: fp8_e5m2]
+          [possible values: fp8_e4m3fn, fp8_e5m2]
 
 ```
 ## TRUST_REMOTE_CODE
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 0d7af66ddd7..d9f569fdaeb 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -307,6 +307,9 @@ impl std::fmt::Display for Dtype {
 
 #[derive(Clone, Copy, Debug, ValueEnum)]
 enum KVCacheDtype {
+    #[clap(name = "fp8_e4m3fn")]
+    Fp8e4m3fn,
+
     #[clap(name = "fp8_e5m2")]
     Fp8e5m2,
 }
@@ -314,6 +317,9 @@ enum KVCacheDtype {
 impl std::fmt::Display for KVCacheDtype {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
+            KVCacheDtype::Fp8e4m3fn => {
+                write!(f, "fp8_e4m3fn")
+            }
             KVCacheDtype::Fp8e5m2 => {
                 write!(f, "fp8_e5m2")
             }
@@ -424,7 +430,7 @@ struct Args {
 
     /// Specify the dtype for the key-value cache. When this option is not provided,
     /// the dtype of the model is used (typically `float16` or `bfloat16`). Currently
-    /// the only supported value is `fp8_e5m2` on CUDA.
+    /// the only supported values are `fp8_e4m3fn` and `fp8_e5m2` on CUDA.
     #[clap(long, env, value_enum)]
     kv_cache_dtype: Option<KVCacheDtype>,
 
diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index db390234e43..a363b33a89a 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -31,6 +31,7 @@ class Dtype(str, Enum):
 
 
 class KVCacheDtype(str, Enum):
+    fp8_e4m3fn = "fp8_e4m3fn"
     fp8_e5m2 = "fp8_e5m2"
 
 
diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py
index 3960c954985..7f1dd370ee8 100644
--- a/server/text_generation_server/layers/attention/kv_cache.py
+++ b/server/text_generation_server/layers/attention/kv_cache.py
@@ -24,11 +24,11 @@ def __init__(
     ):
         """Construct the key-value cache for a layer."""
 
-        if dtype == torch.float8_e5m2 and (
+        if dtype in {torch.float8_e5m2, torch.float8_e4m3fn} and (
             ATTENTION != "flashinfer" or SYSTEM != "cuda"
         ):
             raise ValueError(
-                "float8_e5m2 KV cache is currently only supported for flashinfer on CUDA"
+                "FP8 KV cache is currently only supported for flashinfer on CUDA"
             )
 
         element_size = torch.tensor([], dtype=dtype).element_size()
@@ -105,8 +105,8 @@ def store(
             # TODO: add scale
             key = key.to(key_cache.dtype)
             value = value.to(value_cache.dtype)
-            if key_cache.dtype == torch.float8_e5m2:
-                # Torch index_put does not support float8_e5m2 yet, so
+            if key_cache.dtype in {torch.float8_e5m2, torch.float8_e4m3fn}:
+                # Torch index_put does not support float8_{e5m2,e4m3fn} yet, so
                 # put as raw data instead.
                 key_cache = key_cache.view(torch.uint8)
                 value_cache = value_cache.view(torch.uint8)
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 019617d2baf..de0c66e7beb 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -421,6 +421,8 @@ def get_model(
 
     if kv_cache_dtype is None:
         kv_cache_dtype = dtype
+    elif kv_cache_dtype == "fp8_e4m3fn":
+        kv_cache_dtype = torch.float8_e4m3fn
     elif kv_cache_dtype == "fp8_e5m2":
         kv_cache_dtype = torch.float8_e5m2
     else: