 import dataclasses
 import platform
-from typing import Union
+from typing import Optional, Union
 
 import psutil
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from hivemind import get_logger
 from torch import nn
+from transformers import PretrainedConfig
 
 logger = get_logger(__name__)
 
@@ -21,15 +22,16 @@ class LMHeadConfig:
 
 
 class LMHead(nn.Module):
-    """
-    The modified language modeling head which does not create an extra tensor for the linear layer with weights tied to the input
-    embeddings. Thus, it reduces initial memory consumption, which might be crucial for large dictionaries.
-    In addition, it provides an efficient way to deal with half-precision word embeddings on CPU.
-    """
-
-    def __init__(self, config: LMHeadConfig, word_embeddings: nn.Embedding):
+    def __init__(self, config: PretrainedConfig):
         super().__init__()
-        self.word_embeddings = word_embeddings
+
+        if not config.tie_word_embeddings:
+            self.weight = nn.Parameter(torch.zeros((config.vocab_size, config.hidden_size), requires_grad=False))
+        else:
+            self.weight = None  # Will be set to get_input_embeddings().weight while loading the model
+        self.bias = None
+        self.in_features = config.hidden_size  # Similar to nn.Linear attributes
+        self.out_features = config.vocab_size
 
         self.use_chunked_forward = config.use_chunked_forward
         if self.use_chunked_forward == "auto":
@@ -45,35 +47,17 @@ def __init__(self, config: LMHeadConfig, word_embeddings: nn.Embedding):
         self.chunked_forward_step = config.chunked_forward_step
         self._bf16_warning_shown = False
 
-    @property
-    def in_features(self) -> int:
-        return self.word_embeddings.num_embeddings
-
-    @property
-    def out_features(self) -> int:
-        return self.word_embeddings.embedding_dim
-
-    @property
-    def weight(self):
-        return self.word_embeddings.weight
-
-    @property
-    def bias(self):
-        return None
-
     def forward(self, hidden_states):
-        word_embeddings = self.word_embeddings.weight
-
         if (
-            word_embeddings.dtype in [torch.float16, torch.bfloat16]
-            and word_embeddings.device.type == "cpu"
+            self.weight.dtype in [torch.float16, torch.bfloat16]
+            and self.weight.device.type == "cpu"
             and self.use_chunked_forward
         ):
             lm_logits = self.chunked_forward(hidden_states)
         else:
             # Switch dtype in case word_embeddings are fp16/bf16
-            hidden_states = hidden_states.to(word_embeddings.dtype)
-            lm_logits = F.linear(hidden_states, word_embeddings)
+            hidden_states = hidden_states.to(self.weight.dtype)
+            lm_logits = F.linear(hidden_states, self.weight)
         return lm_logits
 
     def chunked_forward(self, hidden_states):
@@ -83,20 +67,17 @@ def chunked_forward(self, hidden_states):
         assert self.chunked_forward_step > 0, "Chunk size for chunked forward must be positive"
 
         if not self._bf16_warning_shown:
-            if self.word_embeddings.weight.numel() * 4 < 0.9 * psutil.virtual_memory().total:
+            if self.weight.numel() * 4 < 0.9 * psutil.virtual_memory().total:
                 logger.warning(
                     "Running the client with dtype bfloat16 on CPU may be slow, since your CPU doesn't support AVX512. "
                     "Consider loading the model with torch_dtype='float32'"
                 )
                 self._bf16_warning_shown = True
 
-        word_embeddings = self.word_embeddings.weight
-        num_embeddings = self.word_embeddings.num_embeddings
-
         hidden_states = hidden_states.float()
-        output = torch.empty(*hidden_states.shape[:-1], num_embeddings)
+        output = torch.empty(*hidden_states.shape[:-1], self.out_features)
 
-        for i in range(0, num_embeddings, self.chunked_forward_step):
-            chunk = word_embeddings[i : i + self.chunked_forward_step].float()
+        for i in range(0, self.out_features, self.chunked_forward_step):
+            chunk = self.weight[i : i + self.chunked_forward_step].float()
             output[..., i : i + self.chunked_forward_step] = F.linear(hidden_states, chunk)
         return output
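
A standalone sketch (not part of this diff) of the chunked-forward idea used above, assuming an fp16/bf16 weight matrix on CPU; the tensor sizes and chunk step below are placeholder values chosen only for illustration:

import torch
import torch.nn.functional as F

hidden_states = torch.randn(2, 5, 64)                  # (batch, seq, hidden_size), placeholder sizes
weight = torch.randn(1000, 64, dtype=torch.bfloat16)   # (vocab_size, hidden_size), as if tied to bf16 embeddings
step = 256                                             # vocab rows upcast per chunk

# Upcast the weight one vocabulary slice at a time, so a full fp32 copy of the
# vocab_size x hidden_size matrix is never materialized in memory at once.
hidden_states = hidden_states.float()
output = torch.empty(*hidden_states.shape[:-1], weight.shape[0])
for i in range(0, weight.shape[0], step):
    chunk = weight[i : i + step].float()
    output[..., i : i + step] = F.linear(hidden_states, chunk)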