Add GPT-J modeling

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
huggingface · Aug 7, 2024 · 3e41ec2 · 3e41ec2
1 parent 133015f
commit 3e41ec2
Show file tree

Hide file tree

Showing 3 changed files with 449 additions and 0 deletions.
diff --git a/router/src/config.rs b/router/src/config.rs
@@ -153,6 +153,7 @@ pub enum Config {
     Bloom,
     Mpt,
     Gpt2,
+    Gptj,
     GptNeox,
     Phi,
     #[serde(rename = "phi-msft")]

diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
@@ -132,6 +132,9 @@
     from text_generation_server.models.custom_modeling.flash_gpt2_modeling import (
         FlashGPT2ForCausalLM,
     )
+    from text_generation_server.models.custom_modeling.flash_gptj_modeling import (
+        FlashGPTJForCausalLM,
+    )
     from text_generation_server.models.custom_modeling.idefics2 import (
         Idefics2ForConditionalGeneration,
     )
@@ -294,6 +297,11 @@ class ModelType(enum.Enum):
         "name": "Gpt Neox",
         "url": "https://huggingface.co/EleutherAI/gpt-neox-20b",
     }
+    GPTJ = {
+        "type": "gptj",
+        "name": "Gptj",
+        "url": "https://huggingface.co/EleutherAI/gpt-j-6b",
+    }
     IDEFICS = {
         "type": "idefics",
         "name": "Idefics",
@@ -641,6 +649,41 @@ def get_model(
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
+    elif model_type == GPTJ:
+        if FLASH_ATTENTION:
+            try:
+                return FlashCausalLM(
+                    model_id=model_id,
+                    model_class=FlashGPTJForCausalLM,
+                    revision=revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                    lora_adapter_ids=lora_adapter_ids,
+                )
+            except RuntimeError as e:
+                # Lots of legacy models with various weight names.
+                log_master(logger.warning, f"Couldn't load flash gptj variant: {e}")
+                return CausalLM.fallback(
+                    model_id,
+                    revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-J"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
     elif model_type == GPT_NEOX:
         if FLASH_ATTENTION:
             from text_generation_server.models.custom_modeling.flash_neox_modeling import (
-Original file line number
+Diff line change
@@ Expand Up / @@ -153,6 +153,7 @@ pub enum Config { @@
         Bloom,
         Mpt,
         Gpt2,
+        Gptj,
         GptNeox,
         Phi,
         #[serde(rename = "phi-msft")]
@@ Expand Down @@