microsoft · iofu728 · Jul 24, 2024 · Jul 24, 2024
diff --git a/README.md b/README.md
@@ -17,6 +17,7 @@ https://github.com/microsoft/MInference/assets/30883354/52613efc-738f-4081-8367-
 _Now, you can process **1M context 10x faster in a single A100** using Long-context LLMs like LLaMA-3-8B-1M, GLM-4-1M, with even **better accuracy**, try **MInference 1.0** right now!_
 
 ## News
+- 🥤 [24/07/24] MInference now supports [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct).
 - 🪗 [24/07/07] Thanks @AK for sponsoring. You can now use MInference online in the [HF Demo](https://huggingface.co/spaces/microsoft/MInference) with ZeroGPU.
 - 📃 [24/07/03] Due to an issue with arXiv, the PDF is currently unavailable there. You can find the paper at this [link](https://export.arxiv.org/pdf/2407.02490).
 - 🧩 [24/07/03] We will present **MInference 1.0** at the _**Microsoft Booth**_ and _**ES-FoMo**_ at ICML'24. See you in Vienna!
@@ -60,6 +61,7 @@ get_support_models()
 ```
 
 Currently, we support the following LLMs:
+- LLaMA-3.1: [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
 - LLaMA-3: [gradientai/Llama-3-8B-Instruct-262k](https://huggingface.co/gradientai/Llama-3-8B-Instruct-262k), [gradientai/Llama-3-8B-Instruct-Gradient-1048k](https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k), [gradientai/Llama-3-8B-Instruct-Gradient-4194k](https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-4194k)
 - GLM-4: [THUDM/glm-4-9b-chat-1m](https://huggingface.co/THUDM/glm-4-9b-chat-1m)
 - Yi: [01-ai/Yi-9B-200K](https://huggingface.co/01-ai/Yi-9B-200K)

diff --git a/minference/configs/Llama_3.1_8B_Instruct_128k_kv_out_v32_fit_o_best_pattern.json b/minference/configs/Llama_3.1_8B_Instruct_128k_kv_out_v32_fit_o_best_pattern.json
diff --git a/minference/configs/model2path.py b/minference/configs/model2path.py
@@ -26,6 +26,9 @@
     "THUDM/glm-4-9b-chat-1m": os.path.join(
         BASE_DIR, "GLM_4_9B_1M_instruct_kv_out_v32_fit_o_best_pattern.json"
     ),
+    "meta-llama/Meta-Llama-3.1-8B-Instruct": os.path.join(
+        BASE_DIR, "Llama_3.1_8B_Instruct_128k_kv_out_v32_fit_o_best_pattern.json"
+    ),
 }
 
 

diff --git a/minference/modules/minference_forward.py b/minference/modules/minference_forward.py
@@ -8,6 +8,7 @@
 from importlib import import_module
 
 from transformers.models.llama.modeling_llama import *
+from transformers.utils import is_flash_attn_2_available
 from transformers.utils.import_utils import _is_package_available
 
 if _is_package_available("vllm"):
@@ -531,7 +532,7 @@ def forward(
             if os.path.exists(self.config_path):
                 config_list = json.load(open(self.config_path))
                 if self.layer_idx < len(config_list):
-                    assert False
+                    assert False, f"Search completed. The config is located in {self.config_path}."
             else:
                 config_list = []
             config = {}

diff --git a/minference/version.py b/minference/version.py
@@ -5,10 +5,10 @@
 _MINOR = "1"
 # On master and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "4"
+_PATCH = "5"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
-_SUFFIX = ".post4"
+_SUFFIX = ""
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
 VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)