add gptj modeling in TGI #2366 (CI RUN) (#2372)

* add gptj modeling

  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* fix: update docs for model addition

* fix: adjust syntax typo

* fix: adjust syntax typo again

---------

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Wang, Yi A <yi.a.wang@intel.com>
huggingface · Aug 8, 2024 · 21267f3 · 21267f3
1 parent 8094ecf
commit 21267f3
Show file tree

Hide file tree

Showing 4 changed files with 450 additions and 0 deletions.
diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
@@ -32,6 +32,7 @@ Text Generation Inference enables serving optimized models on specific hardware
 - [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
 - [Gpt2](https://huggingface.co/openai-community/gpt2)
 - [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
+- [Gptj](https://huggingface.co/EleutherAI/gpt-j-6b)
 - [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
 
 

diff --git a/router/src/config.rs b/router/src/config.rs
@@ -153,6 +153,7 @@ pub enum Config {
     Bloom,
     Mpt,
     Gpt2,
+    Gptj,
     GptNeox,
     Phi,
     #[serde(rename = "phi-msft")]

diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
@@ -132,6 +132,9 @@
     from text_generation_server.models.custom_modeling.flash_gpt2_modeling import (
         FlashGPT2ForCausalLM,
     )
+    from text_generation_server.models.custom_modeling.flash_gptj_modeling import (
+        FlashGPTJForCausalLM,
+    )
     from text_generation_server.models.custom_modeling.idefics2 import (
         Idefics2ForConditionalGeneration,
     )
@@ -294,6 +297,11 @@ class ModelType(enum.Enum):
         "name": "Gpt Neox",
         "url": "https://huggingface.co/EleutherAI/gpt-neox-20b",
     }
+    GPTJ = {
+        "type": "gptj",
+        "name": "Gptj",
+        "url": "https://huggingface.co/EleutherAI/gpt-j-6b",
+    }
     IDEFICS = {
         "type": "idefics",
         "name": "Idefics",
@@ -641,6 +649,41 @@ def get_model(
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
+    elif model_type == GPTJ:
+        if FLASH_ATTENTION:
+            try:
+                return FlashCausalLM(
+                    model_id=model_id,
+                    model_class=FlashGPTJForCausalLM,
+                    revision=revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                    lora_adapter_ids=lora_adapter_ids,
+                )
+            except RuntimeError as e:
+                # Lots of legacy models with various weight names.
+                log_master(logger.warning, f"Couldn't load flash gptj variant: {e}")
+                return CausalLM.fallback(
+                    model_id,
+                    revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-J"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
     elif model_type == GPT_NEOX:
         if FLASH_ATTENTION:
             from text_generation_server.models.custom_modeling.flash_neox_modeling import (
diff --git a/router/src/config.rs b/router/src/config.rs
@@ -153,6 +153,7 @@ pub enum Config {
         Bloom,
         Mpt,
         Gpt2,
+        Gptj,
         GptNeox,
         Phi,
         #[serde(rename = "phi-msft")]
@@ Expand Down @@