
Commit 1ef0656

Support loading LLaMA and BLOOM blocks from existing repos
1 parent: 675bacb

File tree

10 files changed: +258 / -76 lines


src/petals/cli/run_server.py

Lines changed: 0 additions & 3 deletions
@@ -89,9 +89,6 @@ def main():
     parser.add_argument('--alloc_timeout', type=float, default=60,
                         help='If the cache is full, the server will wait for this number of seconds hoping that some memory will be freed '
                              'before rejecting the request')
-    parser.add_argument('--revision', type=str, default='main',
-                        help="The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a git-based system for storing models"
-                             "and other artifacts on huggingface.co, so `revision` can be any identifier allowed by git.")

     parser.add_argument('--throughput',
                         type=lambda value: value if value in ['auto', 'eval'] else float(value),
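
The type= lambda on --throughput lets the flag carry either a keyword or a number: the literal strings 'auto' and 'eval' pass through as strings, anything else must parse as a float. A standalone sketch of that parsing behavior (the default value below is assumed for the sketch, not taken from the diff):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--throughput',
                    type=lambda value: value if value in ['auto', 'eval'] else float(value),
                    default='auto')  # default assumed for this sketch

print(parser.parse_args(['--throughput', 'eval']).throughput)  # 'eval' (kept as a string)
print(parser.parse_args(['--throughput', '12.5']).throughput)  # 12.5 (parsed as a float)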

src/petals/llama/__init__.py

Whitespace-only changes.

src/petals/llama/block.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+"""
+LLaMA intermediate layer
+Based on https://github.com/huggingface/transformers/commit/ca2a55e9dfb245527b5e1c954fec6ffbb7aef07b
+See commit history for authorship.
+"""
+import os
+from typing import Optional, Tuple
+
+import torch.nn.quantized.dynamic.modules.linear
+import transformers
+from packaging import version
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+
+# if not os.getenv("PETALS_IGNORE_DEPENDENCY_VERSION"):
+#     assert (
+#         version.parse("4.25.1") <= version.parse(transformers.__version__) < version.parse("5.0.0")
+#     ), "Please install a proper transformers version: pip install transformers>=4.25.1,<5.0.0"
+
+
+class WrappedLlamaBlock(LlamaDecoderLayer):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *args,
+        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs
+    ):
+        return super().forward(hidden_states, *args, past_key_value=layer_past, **kwargs)
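
The wrapper exists so the server can keep passing its attention cache under the layer_past name it already uses for BLOOM blocks, while the upstream LlamaDecoderLayer expects past_key_value. A minimal usage sketch, assuming a transformers version contemporary with this commit (where LlamaDecoderLayer takes only a config) and a deliberately tiny, made-up config:

import torch
from transformers import LlamaConfig

from petals.llama.block import WrappedLlamaBlock

# Tiny illustrative config, not a real checkpoint.
config = LlamaConfig(hidden_size=256, intermediate_size=512, num_attention_heads=4, num_hidden_layers=1)
block = WrappedLlamaBlock(config)

seq_len = 8
hidden_states = torch.randn(1, seq_len, config.hidden_size)
position_ids = torch.arange(seq_len).unsqueeze(0)
# Additive causal mask: 0 on/below the diagonal, a large negative value above it.
attn_mask = torch.triu(torch.full((1, 1, seq_len, seq_len), torch.finfo(torch.float32).min), diagonal=1)

# Petals-style call: the cache goes in as `layer_past` and is forwarded as `past_key_value`.
outputs = block(hidden_states, attention_mask=attn_mask, position_ids=position_ids, layer_past=None, use_cache=True)
hidden_states_out, present_key_value = outputs[0], outputs[-1]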

src/petals/llama/modeling_utils.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+"""
+PyTorch BLOOM model that implements several memory-efficient modes.
+Based on https://github.com/huggingface/transformers/commit/ca2a55e9dfb245527b5e1c954fec6ffbb7aef07b
+See commit history for authorship.
+"""
+
+import platform
+
+import psutil
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from hivemind import get_logger
+from torch import nn
+from transformers import BloomConfig
+
+logger = get_logger(__name__)
+
+
+class LMHead(nn.Module):
+    """
+    The modified language modeling head which does not create extra tensor for the linear layer with weights tied to the input
+    embeddings. Thus, it reduces initial memory consumption which might be crucial for large dictionaries.
+    In addition, it provides an efficient way to deal with half-precision word embeddings on CPU.
+    """
+
+    def __init__(self, config: BloomConfig, word_embeddings: nn.Embedding):
+        super().__init__()
+        self.word_embeddings = word_embeddings
+
+        self.use_chunked_forward = config.use_chunked_forward
+        if self.use_chunked_forward == "auto":
+            if platform.machine() == "x86_64":
+                # Import of cpufeature may crash on non-x86_64 machines
+                from cpufeature import CPUFeature
+
+                # If the CPU supports AVX512, plain bfloat16 is ~10x faster than chunked_forward().
+                # Otherwise, it's ~8x slower.
+                self.use_chunked_forward = not (CPUFeature["AVX512f"] and CPUFeature["OS_AVX512"])
+            else:
+                self.use_chunked_forward = True
+        self.chunked_forward_step = config.chunked_forward_step
+        self._bf16_warning_shown = False
+
+    @property
+    def in_features(self) -> int:
+        return self.word_embeddings.num_embeddings
+
+    @property
+    def out_features(self) -> int:
+        return self.word_embeddings.embedding_dim
+
+    @property
+    def weight(self):
+        return self.word_embeddings.weight
+
+    @property
+    def bias(self):
+        return None
+
+    def forward(self, hidden_states):
+        word_embeddings = self.word_embeddings.weight
+
+        if (
+            word_embeddings.dtype in [torch.float16, torch.bfloat16]
+            and word_embeddings.device.type == "cpu"
+            and self.use_chunked_forward
+        ):
+            lm_logits = self.chunked_forward(hidden_states)
+        else:
+            # Switch dtype in case word_embeddings are fp16/bf16
+            hidden_states = hidden_states.to(word_embeddings.dtype)
+            lm_logits = F.linear(hidden_states, word_embeddings)
+        return lm_logits
+
+    def chunked_forward(self, hidden_states):
+        """Splits word embeddings on chunks and iteratively casts them into fp32 to perform matmul more efficiently on CPU.
+        chunked_forward_step: provides trade-off between efficiency and extra memory consumption.
+        """
+        assert self.chunked_forward_step > 0, "Chunk size for chunked forward must be positive"
+
+        if not self._bf16_warning_shown:
+            if self.word_embeddings.weight.numel() * 4 < 0.9 * psutil.virtual_memory().total:
+                logger.warning(
+                    "Running the client with dtype bfloat16 on CPU may be slow, since your CPU doesn't support AVX512. "
+                    "Consider loading the model with torch_dtype='float32'"
+                )
+            self._bf16_warning_shown = True
+
+        word_embeddings = self.word_embeddings.weight
+        num_embeddings = self.word_embeddings.num_embeddings
+
+        hidden_states = hidden_states.float()
+        output = torch.empty(*hidden_states.shape[:-1], num_embeddings)
+
+        for i in range(0, num_embeddings, self.chunked_forward_step):
+            chunk = word_embeddings[i : i + self.chunked_forward_step].float()
+            output[..., i : i + self.chunked_forward_step] = F.linear(hidden_states, chunk)
+        return output
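
chunked_forward() exists because fp16/bf16 matmuls are very slow on CPUs without AVX512: it upcasts chunked_forward_step rows of the embedding matrix to fp32 at a time, so the logits are computed in fp32 without ever materializing a full fp32 copy of the vocabulary. A rough usage sketch; the config object here is a hypothetical stand-in carrying only the two Petals-specific fields that LMHead reads:

from types import SimpleNamespace

import torch
from torch import nn

from petals.llama.modeling_utils import LMHead

# Stand-in for the Petals model config: just the two attributes LMHead needs.
config = SimpleNamespace(use_chunked_forward=True, chunked_forward_step=4096)

word_embeddings = nn.Embedding(num_embeddings=32000, embedding_dim=512).to(torch.bfloat16)
lm_head = LMHead(config, word_embeddings)

hidden_states = torch.randn(1, 8, 512, dtype=torch.bfloat16)
logits = lm_head(hidden_states)  # routed through chunked_forward(): fp32 matmuls over 4096-row chunks
print(logits.shape)  # torch.Size([1, 8, 32000])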

src/petals/server/backend.py

Lines changed: 8 additions & 9 deletions
@@ -1,4 +1,3 @@
-"""Code for serving bloom blocks via hivemind-server"""
 from __future__ import annotations

 from collections import Counter
@@ -12,8 +11,7 @@
 from hivemind.utils import get_logger
 from tensor_parallel import TensorParallel
 from tensor_parallel.tensor_parallel import PerDeviceTensors
-from transformers import BloomConfig
-from transformers.models.bloom.modeling_bloom import BloomAttention
+from transformers import PretrainedConfig

 from petals.data_structures import InferenceMetadata
 from petals.server.memory_cache import Handle, MemoryCache
@@ -24,17 +22,17 @@


 class TransformerBackend(ModuleBackend):
-    """A wrapper for a BLOOM block that can process requests for BLOOM layer forward, backward and inference"""
+    """A wrapper for a transformer block that can process requests for forward, backward and inference"""

-    def __init__(self, *args, config: BloomConfig, memory_cache: MemoryCache, backend_dtype: torch.dtype, **kwargs):
+    def __init__(self, *args, config: PretrainedConfig, memory_cache: MemoryCache, backend_dtype: torch.dtype, **kwargs):
         super().__init__(*args, **kwargs)
         assert isinstance(self.module, TensorParallel)
         self.config = config
         self.memory_cache = memory_cache
         for name, param in self.module.named_parameters():
-            assert not param.requires_grad, f"Bloom layer parameters must not accumulate gradients, but {name} does"
+            assert not param.requires_grad, f"Block parameters must not accumulate gradients, but {name} does"
         for name, buf in self.module.named_buffers():
-            assert not buf.requires_grad, f"Bloom layer parameters must not accumulate gradients, but {name} does"
+            assert not buf.requires_grad, f"Block parameters must not accumulate gradients, but {name} does"

         max_batch_size = self.forward_pool.max_batch_size
         device = self.module.devices[self.module.output_device_index]
@@ -53,9 +51,10 @@ def __init__(self, *args, config: BloomConfig, memory_cache: MemoryCache, backen
         self.shard_num_heads = []
         for shard in self.module.module_shards:
             for submodule in shard.modules():
-                if isinstance(submodule, BloomAttention):
+                if isinstance(submodule, config.attn_class):
                     self.shard_num_heads.append(submodule.num_heads)
-        assert len(self.shard_num_heads) == len(self.module.devices) and sum(self.shard_num_heads) == config.n_head
+        assert len(self.shard_num_heads) == len(self.module.devices)
+        assert sum(self.shard_num_heads) == config.n_head

         self.inference_schema = (
             (
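
With these changes the backend no longer hard-codes BLOOM classes: it asks the config which attention class to look for (config.attn_class) and, in block_utils.py below, which block class to instantiate (config.block_class). How those attributes get attached is not shown in this excerpt; presumably the model-specific config classes elsewhere in the commit set them, roughly along these lines (the class names are illustrative, not taken from the diff, and WrappedBloomBlock is assumed to still be importable from petals.bloom.block, as in the import that block_utils.py drops below):

from transformers import BloomConfig, LlamaConfig
from transformers.models.bloom.modeling_bloom import BloomAttention
from transformers.models.llama.modeling_llama import LlamaAttention

from petals.bloom.block import WrappedBloomBlock
from petals.llama.block import WrappedLlamaBlock


class DistributedBloomConfig(BloomConfig):  # illustrative name, not from the diff
    block_class = WrappedBloomBlock
    attn_class = BloomAttention


class DistributedLlamaConfig(LlamaConfig):  # illustrative name, not from the diff
    block_class = WrappedLlamaBlock
    attn_class = LlamaAttention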

src/petals/server/block_utils.py

Lines changed: 5 additions & 7 deletions
@@ -2,13 +2,11 @@

 import torch
 from accelerate import init_empty_weights
-from transformers import BloomConfig
+from transformers import PretrainedConfig

-from petals.bloom.block import WrappedBloomBlock

-
-def resolve_block_dtype(config: BloomConfig, dtype: Union[str, torch.dtype]) -> Union[str, torch.dtype]:
-    """If dtype is "auto", resolves it using BloomConfig. Returns `dtype` intact otherwise."""
+def resolve_block_dtype(config: PretrainedConfig, dtype: Union[str, torch.dtype]) -> Union[str, torch.dtype]:
+    """If dtype is "auto", resolves it using the config. Returns `dtype` intact otherwise."""

     if dtype == "auto" or dtype is None:
         dtype = config.torch_dtype
@@ -18,7 +16,7 @@ def resolve_block_dtype(config: BloomConfig, dtype: Union[str, torch.dtype]) ->


 def get_block_size(
-    config: BloomConfig,
+    config: PretrainedConfig,
     location: str,
     *,
     dtype: Optional[Union[str, torch.dtype]] = None,
@@ -31,7 +29,7 @@ def get_block_size(
     ), 'get_block_size(..., location="memory") requires to specify dtype and load_in_8bit for calculations'

     with init_empty_weights(include_buffers=True):
-        block = WrappedBloomBlock(config)
+        block = config.block_class(config)
     n_params = sum(param.numel() for param in block.parameters())

     if location == "memory" and load_in_8bit:
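
Because the block class now comes from the config, the same meta-device size estimate works for any supported model. A rough sketch of the pattern used by get_block_size(..., location="memory"); the config values are illustrative, and setting block_class by hand stands in for what the Petals config classes are assumed to provide:

import torch
from accelerate import init_empty_weights
from transformers import BloomConfig

from petals.bloom.block import WrappedBloomBlock  # assumed still importable at this commit

config = BloomConfig(hidden_size=1024, n_head=16)  # illustrative values
config.block_class = WrappedBloomBlock  # normally attached by the Petals config class

# Build the block on the meta device so no real weights are allocated, then count parameters.
with init_empty_weights(include_buffers=True):
    block = config.block_class(config)

n_params = sum(param.numel() for param in block.parameters())
bytes_per_value = torch.finfo(torch.bfloat16).bits // 8  # 2 bytes per bf16 value
print(f"~{n_params * bytes_per_value / 2**20:.0f} MiB per block in bfloat16")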
