From 21267f3ca3f121302b86c1702cc2da6091164c55 Mon Sep 17 00:00:00 2001
From: drbh
Date: Wed, 7 Aug 2024 21:32:37 -0400
Subject: [PATCH] add gptj modeling in TGI #2366 (CI RUN) (#2372)

* add gptj modeling

Signed-off-by: Wang, Yi A

* fix: update docs for model addition

* fix: adjust syntax typo

* fix: adjust syntax typo again

---------

Signed-off-by: Wang, Yi A
Co-authored-by: Wang, Yi A
---
 docs/source/supported_models.md                |   1 +
 router/src/config.rs                           |   1 +
 .../text_generation_server/models/__init__.py  |  43 ++
 .../custom_modeling/flash_gptj_modeling.py     | 405 ++++++++++++++++++
 4 files changed, 450 insertions(+)
 create mode 100644 server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py

diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
index bc124f3194c..b78104dfd9c 100644
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@@ -32,6 +32,7 @@ Text Generation Inference enables serving optimized models on specific hardware
 - [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
 - [Gpt2](https://huggingface.co/openai-community/gpt2)
 - [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
+- [Gptj](https://huggingface.co/EleutherAI/gpt-j-6b)
 - [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
diff --git a/router/src/config.rs b/router/src/config.rs
index 7737165e406..5d0be9c8b32 100644
--- a/router/src/config.rs
+++ b/router/src/config.rs
@@ -153,6 +153,7 @@ pub enum Config {
     Bloom,
     Mpt,
     Gpt2,
+    Gptj,
     GptNeox,
     Phi,
     #[serde(rename = "phi-msft")]
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index ae791ef8cd4..1f9c752661b 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -132,6 +132,9 @@
     from text_generation_server.models.custom_modeling.flash_gpt2_modeling import (
         FlashGPT2ForCausalLM,
     )
+    from text_generation_server.models.custom_modeling.flash_gptj_modeling import (
+        FlashGPTJForCausalLM,
+    )
     from text_generation_server.models.custom_modeling.idefics2 import (
         Idefics2ForConditionalGeneration,
     )
@@ -294,6 +297,11 @@ class ModelType(enum.Enum):
         "name": "Gpt Neox",
         "url": "https://huggingface.co/EleutherAI/gpt-neox-20b",
     }
+    GPTJ = {
+        "type": "gptj",
+        "name": "Gptj",
+        "url": "https://huggingface.co/EleutherAI/gpt-j-6b",
+    }
     IDEFICS = {
         "type": "idefics",
         "name": "Idefics",
@@ -641,6 +649,41 @@ def get_model(
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
+    elif model_type == GPTJ:
+        if FLASH_ATTENTION:
+            try:
+                return FlashCausalLM(
+                    model_id=model_id,
+                    model_class=FlashGPTJForCausalLM,
+                    revision=revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                    lora_adapter_ids=lora_adapter_ids,
+                )
+            except RuntimeError as e:
+                # Lots of legacy models with various weight names.
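+                # A RuntimeError here usually means the checkpoint uses legacy
+                # weight names that the flash GPT-J implementation cannot map,
+                # so warn and fall back to the transformers-based CausalLM path.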
+                log_master(logger.warning, f"Couldn't load flash gptj variant: {e}")
+                return CausalLM.fallback(
+                    model_id,
+                    revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-J"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
     elif model_type == GPT_NEOX:
         if FLASH_ATTENTION:
             from text_generation_server.models.custom_modeling.flash_neox_modeling import (
diff --git a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py
new file mode 100644
index 00000000000..eb667384343
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py
@@ -0,0 +1,405 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.utils.import_utils import SYSTEM
+
+
+def load_attention(config, prefix: str, weights):
+    return TensorParallelColumnLinear.load_multi(
+        config,
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        dim=0,
+        weights=weights,
+        bias=False,
+    )
+
+
+def load_row(config, prefix: str, weights, bias: bool):
+    weight = weights.get_weights_row(prefix)
+
+    if bias and weights.process_group.rank() == 0:
+        # Only the first rank loads the bias so it is added exactly once after the row-parallel all-reduce.
+        bias = weights.get_tensor(f"{prefix}.bias")
+    else:
+        bias = None
+
+    linear = get_linear(weight, bias)
+    return TensorParallelRowLinear(linear, process_group=weights.process_group)
+
+
+class GPTJRotary(PositionRotaryEmbedding):
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ):
+        # Such control flow may add some overhead.
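+        # GPT-J applies rotary embeddings to interleaved even/odd channel
+        # pairs (x[..., ::2] and x[..., 1::2]) rather than to the first and
+        # second halves of the head dimension as GPT-NeoX-style models do,
+        # so this override selects the interleaved layout on each backend.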
+        if SYSTEM == "cuda":
+            import rotary_emb
+
+            q1 = query[..., ::2]
+            q2 = query[..., 1::2]
+
+            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
+
+            k1 = key[..., ::2]
+            k2 = key[..., 1::2]
+
+            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
+        elif SYSTEM == "rocm":
+            from vllm._C import ops
+
+            # NOTE: On ROCm systems, we use a RoPE implementation adapted from vLLM which launches a single kernel for both query/key, contrary to the flash-attn implementation used on NVIDIA systems.
+            # When compiling flash-attn rotary on ROCm, hipcc appears unable to unroll loops, resulting in even slower inference than eager mode: https://github.com/pytorch/pytorch/issues/113773
+
+            head_size = query.shape[-1]
+
+            # In-place operation, updating query and key.
+            ops.rotary_embedding(query, key, head_size, cos, sin, False)
+        elif SYSTEM == "ipex":
+            import intel_extension_for_pytorch as ipex
+
+            ipex.llm.functional.rotary_embedding(
+                query, key, sin, cos, query.size(-1), False
+            )
+        else:
+            raise ValueError(
+                "Your system does not seem to be supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
+
+
+class FlashGPTJAttention(torch.nn.Module):
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+
+        self.head_size = self.hidden_size // self.num_heads
+        self.softmax_scale = self.head_size**-0.5
+        self.rotary_dim = config.rotary_dim
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()})"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()
+
+        self.query_key_value = load_attention(
+            config,
+            prefix=prefix,
+            weights=weights,
+        )
+
+        self.o_proj = load_row(
+            config,
+            prefix=f"{prefix}.out_proj",
+            weights=weights,
+            bias=False,
+        )
+
+        self.kv_head_mapping = torch.arange(
+            0, self.num_heads, dtype=torch.int32, device=weights.device
+        )
+
+        self.rotary_emb = GPTJRotary.static(
+            config=config,
+            dim=self.rotary_dim,
+            base=10000,
+            device=weights.device,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        query, key, value = self.query_key_value(hidden_states).split(
+            self.head_size * self.num_heads, dim=1
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_heads, self.head_size)
+        value = value.view(-1, self.num_heads, self.head_size)
+
+        # Compute rotary embeddings on the first rotary_dim channels only
+        if self.rotary_dim is not None:
+            self.rotary_emb(
+                query[..., : self.rotary_dim], key[..., : self.rotary_dim], cos, sin
+            )
+        else:
+            self.rotary_emb(query, key, cos, sin)
+
+        reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attn_output = attention(
+                query,
+                key,
+                value,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+
+
+class GPTJMLP(nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        act = config.activation_function
+        self.act = (
+            ACT2FN[act]
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        )
+
+        self.fc_in = TensorParallelColumnLinear.load(
+            config, prefix=f"{prefix}.fc_in", weights=weights, bias=True
+        )
+
+        self.fc_out = load_row(
+            config,
+            prefix=f"{prefix}.fc_out",
+            weights=weights,
+            bias=True,
+        )
+
+    def forward(self, hidden_states):
+        hidden_states = self.fc_in(hidden_states)
+        hidden_states = self.act(hidden_states)
+        return self.fc_out(hidden_states)
+
+
+class FlashGPTJLayer(nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        self.self_attn = FlashGPTJAttention(
+            prefix=f"{prefix}.attn", config=config, weights=weights
+        )
+        self.mlp = GPTJMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+
+        self.input_layernorm = FastLayerNorm.load(
+            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        # Self Attention
+        attn_output = self.self_attn(
+            hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+
+        feed_forward_hidden_states = self.mlp(hidden_states)
+
+        return attn_output + feed_forward_hidden_states, residual
+
+
+class FlashGPTJModel(torch.nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        self.config = config
+
+        self.wte = TensorParallelEmbedding(prefix=f"{prefix}.wte", weights=weights)
+        self.layers = nn.ModuleList(
+            [
+                FlashGPTJLayer(
+                    prefix=(
+                        f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}"
+                    ),
+                    config=config,
+                    weights=weights,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.ln_f = FastLayerNorm.load(
+            prefix="ln_f" if not prefix else f"{prefix}.ln_f",
+            weights=weights,
+            eps=config.layer_norm_epsilon,
+        )
+
+        self.gradient_checkpointing = False
+
+        self.head_size = self.layers[0].self_attn.head_size
+        self.num_heads = self.layers[0].self_attn.num_heads
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+
+        # Get rotary cos and sin for this forward
+        # Avoid indexing in each layer
+        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+            position_ids, max_s, hidden_states.dtype
+        )
+
+        residual = None
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[i],
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+            )
+
+        hidden_states, _ = self.ln_f(hidden_states, residual)
+
+        return hidden_states
+
+
+class FlashGPTJForCausalLM(torch.nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        if not prefix:
+            prefix = "transformer"
+        else:
+            prefix = f"{prefix}.transformer"
+        self.model = FlashGPTJModel(prefix, config, weights)
+        self.lm_head = SpeculativeHead.load(
+            config,
+            prefix="lm_head",
+            weights=weights,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor] = None,
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        hidden_states = self.model(
+            input_ids,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            prefill_cache_indices=prefill_cache_indices,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]
+        logits, speculative_logits = self.lm_head(hidden_states)
+        return logits, speculative_logits
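
For reference, the interleaved (GPT-J style) rotation that GPTJRotary hands off to the CUDA, ROCm, and IPEX kernels can be sketched in plain PyTorch as follows. This is an illustrative sketch only, not part of the patch; the helper name apply_gptj_rotary_ref and the shape notes in the comments are assumptions.

import torch

def apply_gptj_rotary_ref(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: [..., rotary_dim], where consecutive even/odd channels form rotation pairs.
    # cos/sin: broadcastable to x[..., ::2], one angle per pair and position.
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    out = torch.empty_like(x)
    # Rotate every (even, odd) pair by its position-dependent angle; this is
    # what rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False) is expected
    # to compute in place on the CUDA path.
    out[..., ::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out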
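
A quick end-to-end smoke test for the new model type, assuming a TGI server has already been launched locally with --model-id EleutherAI/gpt-j-6b and is listening on port 8080 (the port, prompt, and generation parameters are arbitrary choices for this example):

import requests

# Query the /generate endpoint of a locally running text-generation-inference server.
resp = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {"max_new_tokens": 16, "do_sample": False},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])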