diff --git a/Dockerfile b/Dockerfile
index bcdec85..8c8377f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,6 @@
-FROM nvidia/cuda:11.6.0-devel-ubi8 as cuda
+FROM nvidia/cuda:11.6.1-devel-ubi8 as base

-ENV PORT=5000
-
-WORKDIR /src
-
-FROM cuda as conda
+RUN dnf install -y --disableplugin=subscription-manager make git && dnf clean all --disableplugin=subscription-manager

 # taken form pytorch's dockerfile
 RUN curl -L -o ./miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
@@ -21,47 +17,55 @@ RUN conda create -n inference python=${PYTHON_VERSION} pip -y
 # change shell to activate env
 SHELL ["conda", "run", "-n", "inference", "/bin/bash", "-c"]

-FROM conda as conda_env
+FROM base as conda
+
+# update conda
+RUN conda update -n base -c defaults conda -y
+# cmake
+RUN conda install -c anaconda cmake -y

 # update conda
 RUN conda update -n base -c defaults conda -y

 # necessary stuff
 RUN pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 \
-    transformers \
-    deepspeed==0.7.5 \
-    deepspeed-mii==0.0.2 \
-    accelerate \
-    gunicorn \
+    transformers==4.25.1 \
+    deepspeed==0.7.6 \
+    accelerate==0.15.0 \
+    gunicorn==20.1.0 \
     flask \
-    flask_api \
-    pydantic \
-    huggingface_hub \
+    flask_api \
+    fastapi==0.89.1 \
+    uvicorn==0.19.0 \
+    jinja2==3.1.2 \
+    pydantic==1.10.2 \
+    huggingface_hub==0.10.1 \
     grpcio-tools==1.50.0 \
     --no-cache-dir

-# copy the code
-COPY inference_server inference_server
-COPY Makefile Makefile
-COPY LICENSE LICENSE
-
-# install grpc and compile protos
-RUN make gen-proto
-
 # clean conda env
 RUN conda clean -ya

-EXPOSE ${PORT}
-
 # change this as you like 🤗
-ENV TRANSFORMERS_CACHE=/transformers_cache/ \
-    HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE} \
-    HOME=/homedir
+ENV TRANSFORMERS_CACHE=/cos/HF_cache \
+    HUGGINGFACE_HUB_CACHE=${TRANSFORMERS_CACHE}

-RUN mkdir ${HOME} && chmod g+wx ${HOME} && \
-    mkdir tmp && chmod -R g+w tmp
+FROM conda as app

-# for debugging
-# RUN chmod -R g+w inference_server && chmod g+w Makefile
+WORKDIR /src
+RUN chmod -R g+w /src
+
+RUN mkdir /.cache && \
+    chmod -R g+w /.cache

-CMD make bloom-176b
+ENV PORT=5000 \
+    UI_PORT=5001
+EXPOSE ${PORT}
+EXPOSE ${UI_PORT}
+
+CMD git clone https://github.com/huggingface/transformers-bloom-inference.git && \
+    cd transformers-bloom-inference && \
+    # install grpc and compile protos
+    make gen-proto && \
+    make ui && \
+    make bloom-560m
diff --git a/Makefile b/Makefile
index a26c359..1be6c53 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,4 @@
 gen-proto:
-	pip install grpcio-tools==1.50.0 --no-cache-dir
-
 	mkdir -p inference_server/model_handler/grpc_utils/pb

 	python -m grpc_tools.protoc -Iinference_server/model_handler/grpc_utils/proto --python_out=inference_server/model_handler/grpc_utils/pb --grpc_python_out=inference_server/model_handler/grpc_utils/pb inference_server/model_handler/grpc_utils/proto/generation.proto
@@ -100,3 +98,6 @@ codegen-mono:
 	MAX_BATCH_SIZE=4 \
 	CUDA_VISIBLE_DEVICES=0 \
 	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
+
+ui:
+	python -m ui &
diff --git a/inference_server/constants.py b/inference_server/constants.py
index 9221bf0..ba966f6 100644
--- a/inference_server/constants.py
+++ b/inference_server/constants.py
@@ -3,8 +3,4 @@
 DS_INFERENCE = "ds_inference"
 DS_ZERO = "ds_zero"

-# model weights
-DS_INFERENCE_BLOOM_FP16 = "microsoft/bloom-deepspeed-inference-fp16"
-DS_INFERENCE_BLOOM_INT8 = "microsoft/bloom-deepspeed-inference-int8"
"microsoft/bloom-deepspeed-inference-int8" - # GRPC_MAX_MSG_SIZE = 2**30 # 1GB diff --git a/inference_server/download_model.py b/inference_server/download_model.py index 6ba254c..5055af9 100644 --- a/inference_server/download_model.py +++ b/inference_server/download_model.py @@ -1,6 +1,7 @@ import argparse -from .models import get_downloaded_model_path +from inference_server.models import get_hf_model_class +from transformers import AutoConfig, AutoTokenizer def get_args() -> argparse.Namespace: @@ -12,6 +13,12 @@ def get_args() -> argparse.Namespace: required=True, help="model to use", ) + parser.add_argument( + "--model_class", + type=str, + required=True, + help="model class to use", + ) args = parser.parse_args() @@ -20,7 +27,10 @@ def get_args() -> argparse.Namespace: def main() -> None: args = get_args() - get_downloaded_model_path(args.model_name) + print("downloading", args.model_name) + AutoConfig.from_pretrained(args.model_name) + AutoTokenizer.from_pretrained(args.model_name) + get_hf_model_class(args.model_class).from_pretrained(args.model_name) if __name__ == "__main__": diff --git a/inference_server/model_handler/deployment.py b/inference_server/model_handler/deployment.py index edbeb24..86264ec 100644 --- a/inference_server/model_handler/deployment.py +++ b/inference_server/model_handler/deployment.py @@ -9,11 +9,9 @@ from typing import List import grpc -from mii.server_client import MIIServerClient -from transformers import AutoTokenizer from ..constants import DS_INFERENCE, DS_ZERO -from ..models import get_downloaded_model_path, get_model_class, load_tokenizer +from ..models import get_model_class, load_tokenizer from ..utils import ( GenerateResponse, TokenizeRequest, @@ -25,14 +23,14 @@ from .grpc_utils.pb import generation_pb2, generation_pb2_grpc -class ModelDeployment(MIIServerClient): +class ModelDeployment: def __init__(self, args: argparse.Namespace, use_grpc_server: bool = False, cuda_visible_devices: List[int] = [0]): self.cuda_visible_devices = cuda_visible_devices self.num_gpus = len(self.cuda_visible_devices) self.use_grpc_server = use_grpc_server if self.use_grpc_server: - self.tokenizer = load_tokenizer(get_downloaded_model_path(args.model_name)) + self.tokenizer = load_tokenizer(args.model_name) self.initialize_ports() @@ -57,6 +55,27 @@ def initialize_ports(self): for i in range(self.num_gpus): self.ports.append(50950 + self.cuda_visible_devices[i]) + def _is_socket_open(self, port): + import socket + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + result = sock.connect_ex(("0.0.0.0", port)) + sock.close() + return result == 0 + + def _is_server_process_alive(self): + if self.process is None: + return True + try: + self.process.wait(1) + except subprocess.TimeoutExpired as err: + # timeout means we're still running and all (probably) okay + is_alive = True + else: + # no exception case + is_alive = False + return is_alive + def _wait_until_server_is_live(self): sockets_open = False while not sockets_open: diff --git a/inference_server/models/__init__.py b/inference_server/models/__init__.py index 6677389..c4d26a3 100644 --- a/inference_server/models/__init__.py +++ b/inference_server/models/__init__.py @@ -1,5 +1,5 @@ from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE -from .model import Model, get_downloaded_model_path, load_tokenizer +from .model import Model, get_hf_model_class, load_tokenizer def get_model_class(deployment_framework: str): diff --git a/inference_server/models/ds_inference.py 
index d33329f..91e35ab 100644
--- a/inference_server/models/ds_inference.py
+++ b/inference_server/models/ds_inference.py
@@ -9,10 +9,11 @@
 import torch.distributed as dist

 import deepspeed
-from transformers import AutoConfig, AutoTokenizer
+from huggingface_hub import try_to_load_from_cache
+from transformers import AutoConfig

 from ..utils import print_rank_n, run_rank_n
-from .model import Model, get_downloaded_model_path, get_hf_model_class, load_tokenizer
+from .model import Model, get_hf_model_class


 # basic DeepSpeed inference model class for benchmarking
@@ -24,26 +25,23 @@ def __init__(self, args: Namespace) -> None:

         world_size = int(os.getenv("WORLD_SIZE", "1"))

-        downloaded_model_path = get_downloaded_model_path(args.model_name)
-
-        self.tokenizer = load_tokenizer(downloaded_model_path)
-        self.pad = self.tokenizer.pad_token_id
-
         # create dummy tensors for allocating space which will be filled with
         # the actual weights while calling deepspeed.init_inference in the
         # following code
         with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
             self.model = get_hf_model_class(args.model_class).from_config(
-                AutoConfig.from_pretrained(downloaded_model_path), torch_dtype=torch.bfloat16
+                AutoConfig.from_pretrained(args.model_name), torch_dtype=torch.bfloat16
             )
         self.model = self.model.eval()

+        downloaded_model_path = get_model_path(args.model_name)
+
         if args.dtype in [torch.float16, torch.int8]:
             # We currently support the weights provided by microsoft (which are
             # pre-sharded)
-            if args.use_pre_sharded_checkpoints:
-                checkpoints_json = os.path.join(downloaded_model_path, "ds_inference_config.json")
+            checkpoints_json = os.path.join(downloaded_model_path, "ds_inference_config.json")
+            if os.path.isfile(checkpoints_json):
                 self.model = deepspeed.init_inference(
                     self.model,
                     mp_size=world_size,
@@ -60,6 +58,7 @@ def __init__(self, args: Namespace) -> None:
                 self.model = deepspeed.init_inference(
                     self.model,
                     mp_size=world_size,
+                    base_dir=downloaded_model_path,
                     dtype=args.dtype,
                     checkpoint=checkpoints_json,
                     replace_with_kernel_inject=True,
@@ -74,6 +73,8 @@ def __init__(self, args: Namespace) -> None:
         print_rank_n("Model loaded")
         dist.barrier()

+        self.post_init(args.model_name)
+

 class TemporaryCheckpointsJSON:
     def __init__(self, model_path: str):
@@ -93,3 +94,16 @@ def __enter__(self):

     def __exit__(self, type, value, traceback):
         return
+
+
+def get_model_path(model_name: str):
+    config_file = "config.json"
+
+    # will fall back to HUGGINGFACE_HUB_CACHE
+    config_path = try_to_load_from_cache(model_name, config_file, cache_dir=os.getenv("TRANSFORMERS_CACHE"))
+
+    if config_path is not None:
+        return os.path.dirname(config_path)
+    # treat the model name as an explicit model path
+    elif os.path.isfile(os.path.join(model_name, config_file)):
+        return model_name
diff --git a/inference_server/models/ds_zero.py b/inference_server/models/ds_zero.py
index 5cc4b54..bc9c20a 100644
--- a/inference_server/models/ds_zero.py
+++ b/inference_server/models/ds_zero.py
@@ -5,11 +5,11 @@
 import torch.distributed as dist

 import deepspeed
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoConfig
 from transformers.deepspeed import HfDeepSpeedConfig

 from ..utils import print_rank_n
-from .model import Model, get_downloaded_model_path, get_hf_model_class, load_tokenizer
+from .model import Model, get_hf_model_class


 class DSZeROModel(Model):
@@ -18,9 +18,7 @@ def __init__(self, args: Namespace) -> None:

         super().__init__(args)

-        downloaded_model_path = get_downloaded_model_path(args.model_name)
-
-        config = AutoConfig.from_pretrained(downloaded_model_path)
+        config = AutoConfig.from_pretrained(args.model_name)

         world_size = int(os.getenv("WORLD_SIZE", "1"))
         train_batch_size = 1 * world_size
@@ -54,12 +52,7 @@ def __init__(self, args: Namespace) -> None:
         # this tells from_pretrained to instantiate directly on gpus
         dschf = HfDeepSpeedConfig(ds_config)

-        self.tokenizer = load_tokenizer(downloaded_model_path)
-        self.pad = self.tokenizer.pad_token_id
-
-        self.model = get_hf_model_class(args.model_class).from_pretrained(
-            downloaded_model_path, torch_dtype=args.dtype
-        )
+        self.model = get_hf_model_class(args.model_class).from_pretrained(args.model_name, torch_dtype=args.dtype)
         self.model = self.model.eval()

         # convert model to a fully sharded model using ZeRO
@@ -74,3 +67,5 @@ def __init__(self, args: Namespace) -> None:

         print_rank_n("Model loaded")
         dist.barrier()
+
+        self.post_init(args.model_name)
diff --git a/inference_server/models/hf_accelerate.py b/inference_server/models/hf_accelerate.py
index 32cac7f..487ed33 100644
--- a/inference_server/models/hf_accelerate.py
+++ b/inference_server/models/hf_accelerate.py
@@ -2,10 +2,8 @@

 import torch

-from transformers import AutoModelForCausalLM, AutoTokenizer
-
 from ..utils import print_rank_n
-from .model import Model, get_downloaded_model_path, get_hf_model_class, load_tokenizer
+from .model import Model, get_hf_model_class


 class HFAccelerateModel(Model):
@@ -14,12 +12,7 @@ def __init__(self, args: Namespace) -> None:

         super().__init__(args)

-        downloaded_model_path = get_downloaded_model_path(args.model_name)
-
-        self.tokenizer = load_tokenizer(downloaded_model_path)
-        self.pad = self.tokenizer.pad_token_id
-
-        kwargs = {"pretrained_model_name_or_path": downloaded_model_path, "device_map": "auto"}
+        kwargs = {"pretrained_model_name_or_path": args.model_name, "device_map": "auto"}

         if len(args.cuda_visible_devices) > 1:
             kwargs["device_map"] = "balanced_low_0"
@@ -39,3 +32,5 @@ def __init__(self, args: Namespace) -> None:
         self.input_device = "cuda:0"

         print_rank_n("Model loaded")
+
+        self.post_init(args.model_name)
diff --git a/inference_server/models/model.py b/inference_server/models/model.py
index 2dda1ab..2b1e809 100644
--- a/inference_server/models/model.py
+++ b/inference_server/models/model.py
@@ -1,30 +1,32 @@
 import argparse
-import os
-from functools import partial
 from typing import Union

 import torch

 import transformers
-from huggingface_hub import snapshot_download
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
-from transformers.utils import is_offline_mode

-from ..utils import GenerateRequest, GenerateResponse, GenerationMixin, TokenizeRequest, TokenizeResponse, run_rank_n
+from ..utils import GenerateRequest, GenerateResponse, TokenizeRequest, TokenizeResponse


 class Model:
     def __init__(self, args: argparse.Namespace) -> None:
-        self.tokenizer = None
-        self.pad = None
         self.model = None
         self.input_device = None

         self.max_input_length = args.max_input_length
         self.max_batch_size = args.max_batch_size

+    def post_init(self, model_name: str) -> None:
+        self.tokenizer = load_tokenizer(model_name)
+
+        self.pad = self.tokenizer.pad_token_id
+        self.prefix_token_id = self.tokenizer("A")["input_ids"][0]
+
     def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exception]:
         try:
-            check_batch_size(len(request.text), self.max_batch_size)
+            batch_size = len(request.text)
+
+            check_batch_size(batch_size, self.max_batch_size)
             input_tokens = self.tokenizer(request.text, return_tensors="pt", padding=True)
             max_input_length_in_batch = input_tokens.input_ids[0].shape[0]
@@ -35,7 +37,9 @@ def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exceptio
                 if torch.is_tensor(input_tokens[t]):
                     input_tokens[t] = input_tokens[t].to(self.input_device)

-            output = GenerationMixin(self.model).generate(
+            num_input_tokens = input_tokens["input_ids"].shape[1]
+
+            output = self.model.generate(
                 **input_tokens,
                 min_length=request.min_length,
                 do_sample=request.do_sample,
@@ -63,16 +67,22 @@ def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exceptio
             )

             output_tokens = output.sequences
-            num_generated_tokens = output.num_generated_tokens.tolist()
+
+            generated_tokens = output_tokens[:, num_input_tokens:]
+            num_generated_tokens = (generated_tokens != self.pad).sum(dim=-1).tolist()

             if request.remove_input_from_output:
+                # create the dummy prefix for detokenization
+                prefix_to_add = torch.tensor([[self.prefix_token_id]] * batch_size).to(self.input_device)
                 # the generate method's output includes input too. Remove input if
                 # that is requested by the user
-                output_tokens = [x[-i:] if i != 0 else [] for x, i in zip(output_tokens, num_generated_tokens)]
-
-            output_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
+                generated_tokens = torch.cat([prefix_to_add, generated_tokens], dim=1)
+                generated_text = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+                generated_text = [i[1:] for i in generated_text]
+            else:
+                generated_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True)

-            return GenerateResponse(text=output_text, num_generated_tokens=num_generated_tokens)
+            return GenerateResponse(text=generated_text, num_generated_tokens=num_generated_tokens)
         except Exception as exception:
             return exception
@@ -81,21 +91,6 @@ def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:
         return TokenizeResponse(token_ids=response.input_ids, attention_mask=response.attention_mask)


-def get_downloaded_model_path(model_name: str):
-    f = partial(
-        snapshot_download,
-        repo_id=model_name,
-        local_files_only=is_offline_mode(),
-        cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
-        # maybe move to safetensors in the future
-        ignore_patterns=["*.safetensors", "*.msgpack", "*.h5", "*log*", "*evaluation*", "tensorboard"],
-    )
-    # download only on 1 process
-    run_rank_n(f, barrier=True)
-    # now since the snapshot is downloaded, pass the model_path to all processes
-    return f()
-
-
 def check_max_input_length(input_token_length: int, max_input_length: int) -> None:
     if max_input_length is None:
         return
@@ -118,10 +113,9 @@ def get_hf_model_class(model_class: str) -> Union[AutoModelForCausalLM, AutoMode

 def load_tokenizer(model_name: str) -> AutoTokenizer:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

     if tokenizer.pad_token_id is None:
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-    tokenizer.padding_side = "left"

     return tokenizer
diff --git a/inference_server/utils/__init__.py b/inference_server/utils/__init__.py
index d4c074b..1e33f98 100644
--- a/inference_server/utils/__init__.py
+++ b/inference_server/utils/__init__.py
@@ -1,4 +1,3 @@
-from .generation_utils import GenerationMixin
 from .requests import (
     GenerateRequest,
     GenerateResponse,
diff --git a/inference_server/utils/generation_utils.py b/inference_server/utils/generation_utils.py
deleted file mode 100644
index d970bf7..0000000
--- a/inference_server/utils/generation_utils.py
+++ /dev/null
@@ -1,389 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Note: this is a minimalistic copy of transformers.generation_utils and only
-# supports greedy decoding and sampling
-
-import warnings
-from dataclasses import dataclass
-from typing import Any, Optional, Union
-
-import torch
-import torch.distributed as dist
-from torch import nn
-
-import transformers
-from transformers.generation_logits_process import LogitsProcessorList
-from transformers.generation_stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
-from transformers.generation_utils import (
-    GreedySearchDecoderOnlyOutput,
-    GreedySearchOutput,
-    SampleDecoderOnlyOutput,
-    SampleEncoderDecoderOutput,
-    SampleOutput,
-)
-
-
-@dataclass
-class GreedySearchEncoderDecoderOutput(transformers.generation_utils.GreedySearchEncoderDecoderOutput):
-    num_generated_tokens: torch.LongTensor = None
-
-
-@dataclass
-class GreedySearchDecoderOnlyOutput(transformers.generation_utils.GreedySearchDecoderOnlyOutput):
-    num_generated_tokens: torch.LongTensor = None
-
-
-@dataclass
-class SampleEncoderDecoderOutput(transformers.generation_utils.SampleEncoderDecoderOutput):
-    num_generated_tokens: torch.LongTensor = None
-
-
-@dataclass
-class SampleDecoderOnlyOutput(transformers.generation_utils.SampleDecoderOnlyOutput):
-    num_generated_tokens: torch.LongTensor = None
-
-
-class GenerationMixin(transformers.generation_utils.GenerationMixin):
-    def __init__(self, model) -> None:
-        super().__init__()
-        self.model = model
-
-    def __getattr__(self, name):
-        try:
-            return super().__getattr__(name)
-        except AttributeError:
-            return getattr(self.model, name)
-
-    def __call__(self, *args: Any, **kwds: Any) -> Any:
-        return self.model(*args, **kwds)
-
-    def greedy_search(
-        self,
-        input_ids: torch.LongTensor,
-        logits_processor: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[StoppingCriteriaList] = None,
-        max_length: Optional[int] = None,
-        pad_token_id: Optional[int] = None,
-        eos_token_id: Optional[int] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_scores: Optional[bool] = None,
-        return_dict_in_generate: Optional[bool] = None,
-        synced_gpus: Optional[bool] = False,
-        **model_kwargs,
-    ) -> Union[GreedySearchOutput, torch.LongTensor]:
-        # init values
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-        if max_length is not None:
-            warnings.warn(
-                "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
-                UserWarning,
-            )
-            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
-        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
-        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
-        output_scores = output_scores if output_scores is not None else self.config.output_scores
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict_in_generate = (
-            return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
-        )
-
-        # init attention / hidden states / scores tuples
-        scores = () if (return_dict_in_generate and output_scores) else None
-        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
-        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
-        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
-
-        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
-        if return_dict_in_generate and self.config.is_encoder_decoder:
-            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
-            encoder_hidden_states = (
-                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
-            )
-
-        # keep track of which sequences are already finished
-        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-        num_generated_tokens = input_ids.new(input_ids.shape[0]).fill_(0)
-        cur_len = input_ids.shape[-1]
-
-        this_peer_finished = False  # used by synced_gpus only
-        while True:
-
-            if synced_gpus:
-                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
-                # The following logic allows an early break if all peers finished generating their sequence
-                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
-                # send 0.0 if we finished, 1.0 otherwise
-                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
-                # did all peers finish? the reduced sum will be 0.0 then
-                if this_peer_finished_flag.item() == 0.0:
-                    break
-
-            # prepare model inputs
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-            )
-
-            if synced_gpus and this_peer_finished:
-                cur_len = cur_len + 1
-                continue  # don't waste resources running the code we don't need
-
-            next_token_logits = outputs.logits[:, -1, :]
-
-            # pre-process distribution
-            next_tokens_scores = logits_processor(input_ids, next_token_logits)
-
-            # Store scores, attentions and hidden_states when required
-            if return_dict_in_generate:
-                if output_scores:
-                    scores += (next_tokens_scores,)
-                if output_attentions:
-                    decoder_attentions += (
-                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
-                    )
-                    if self.config.is_encoder_decoder:
-                        cross_attentions += (outputs.cross_attentions,)
-
-                if output_hidden_states:
-                    decoder_hidden_states += (
-                        (outputs.decoder_hidden_states,)
-                        if self.config.is_encoder_decoder
-                        else (outputs.hidden_states,)
-                    )
-
-            # argmax
-            next_tokens = torch.argmax(next_tokens_scores, dim=-1)
-
-            # finished sentences should have their next token be a padding token
-            if eos_token_id is not None:
-                if pad_token_id is None:
-                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
-                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
-            num_generated_tokens += unfinished_sequences
-
-            # update generated ids, model inputs, and length for next step
-            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-            model_kwargs = self._update_model_kwargs_for_generation(
-                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
-            )
-            cur_len = cur_len + 1
-
-            # if eos_token was found in one sentence, set sentence to finished
-            if eos_token_id is not None:
-                unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
-
-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                if not synced_gpus:
-                    break
-                else:
-                    this_peer_finished = True
-
-        if return_dict_in_generate:
-            if self.config.is_encoder_decoder:
-                return GreedySearchEncoderDecoderOutput(
-                    sequences=input_ids,
-                    num_generated_tokens=num_generated_tokens,
-                    scores=scores,
-                    encoder_attentions=encoder_attentions,
-                    encoder_hidden_states=encoder_hidden_states,
-                    decoder_attentions=decoder_attentions,
-                    cross_attentions=cross_attentions,
-                    decoder_hidden_states=decoder_hidden_states,
-                )
-            else:
-                return GreedySearchDecoderOnlyOutput(
-                    sequences=input_ids,
-                    num_generated_tokens=num_generated_tokens,
-                    scores=scores,
-                    attentions=decoder_attentions,
-                    hidden_states=decoder_hidden_states,
-                )
-        else:
-            return input_ids
-
-    def sample(
-        self,
-        input_ids: torch.LongTensor,
-        logits_processor: Optional[LogitsProcessorList] = None,
-        stopping_criteria: Optional[StoppingCriteriaList] = None,
-        logits_warper: Optional[LogitsProcessorList] = None,
-        max_length: Optional[int] = None,
-        pad_token_id: Optional[int] = None,
-        eos_token_id: Optional[int] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        output_scores: Optional[bool] = None,
-        return_dict_in_generate: Optional[bool] = None,
-        synced_gpus: Optional[bool] = False,
-        **model_kwargs,
-    ) -> Union[SampleOutput, torch.LongTensor]:
-        # init values
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-        if max_length is not None:
-            warnings.warn(
-                "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
-                UserWarning,
-            )
-            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
-        logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
-        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
-        eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
-        output_scores = output_scores if output_scores is not None else self.config.output_scores
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict_in_generate = (
-            return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
-        )
-
-        # init attention / hidden states / scores tuples
-        scores = () if (return_dict_in_generate and output_scores) else None
-        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
-        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
-        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
-
-        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
-        if return_dict_in_generate and self.config.is_encoder_decoder:
-            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
-            encoder_hidden_states = (
-                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
-            )
-
-        # keep track of which sequences are already finished
-        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-        num_generated_tokens = input_ids.new(input_ids.shape[0]).fill_(0)
-        cur_len = input_ids.shape[-1]
-
-        this_peer_finished = False  # used by synced_gpus only
-        # auto-regressive generation
-        while True:
-
-            if synced_gpus:
-                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
-                # The following logic allows an early break if all peers finished generating their sequence
-                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
-                # send 0.0 if we finished, 1.0 otherwise
-                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
-                # did all peers finish? the reduced sum will be 0.0 then
-                if this_peer_finished_flag.item() == 0.0:
-                    break
-
-            # prepare model inputs
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-            )
-
-            if synced_gpus and this_peer_finished:
-                cur_len = cur_len + 1
-                continue  # don't waste resources running the code we don't need
-
-            next_token_logits = outputs.logits[:, -1, :]
-
-            # pre-process distribution
-            next_token_scores = logits_processor(input_ids, next_token_logits)
-            next_token_scores = logits_warper(input_ids, next_token_scores)
-
-            # Store scores, attentions and hidden_states when required
-            if return_dict_in_generate:
-                if output_scores:
-                    scores += (next_token_scores,)
-                if output_attentions:
-                    decoder_attentions += (
-                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
-                    )
-                    if self.config.is_encoder_decoder:
-                        cross_attentions += (outputs.cross_attentions,)
-
-                if output_hidden_states:
-                    decoder_hidden_states += (
-                        (outputs.decoder_hidden_states,)
-                        if self.config.is_encoder_decoder
-                        else (outputs.hidden_states,)
-                    )
-
-            # sample
-            probs = nn.functional.softmax(next_token_scores, dim=-1)
-            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
-
-            # finished sentences should have their next token be a padding token
-            if eos_token_id is not None:
-                if pad_token_id is None:
-                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
-                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
-            num_generated_tokens += unfinished_sequences
-
-            # update generated ids, model inputs, and length for next step
-            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-            model_kwargs = self._update_model_kwargs_for_generation(
-                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
-            )
-            cur_len = cur_len + 1
-
-            # if eos_token was found in one sentence, set sentence to finished
-            if eos_token_id is not None:
-                unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
-
-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                if not synced_gpus:
-                    break
-                else:
-                    this_peer_finished = True
-
-        if return_dict_in_generate:
-            if self.config.is_encoder_decoder:
-                return SampleEncoderDecoderOutput(
-                    sequences=input_ids,
-                    num_generated_tokens=num_generated_tokens,
-                    scores=scores,
-                    encoder_attentions=encoder_attentions,
-                    encoder_hidden_states=encoder_hidden_states,
-                    decoder_attentions=decoder_attentions,
-                    cross_attentions=cross_attentions,
-                    decoder_hidden_states=decoder_hidden_states,
-                )
-            else:
-                return SampleDecoderOnlyOutput(
-                    sequences=input_ids,
-                    num_generated_tokens=num_generated_tokens,
-                    scores=scores,
-                    attentions=decoder_attentions,
-                    hidden_states=decoder_hidden_states,
-                )
-        else:
-            return input_ids
diff --git a/inference_server/utils/utils.py b/inference_server/utils/utils.py
index 7a674a2..299b554 100644
--- a/inference_server/utils/utils.py
+++ b/inference_server/utils/utils.py
@@ -11,7 +11,7 @@
 import torch
 import torch.distributed as dist

-from ..constants import DS_INFERENCE, DS_INFERENCE_BLOOM_FP16, DS_INFERENCE_BLOOM_INT8, DS_ZERO, HF_ACCELERATE
+from ..constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE


 # used for benchmarks
@@ -67,7 +67,6 @@ def parse_args(parser: argparse.ArgumentParser) -> argparse.Namespace:

     args.dtype = get_torch_dtype(args.dtype)
     args.generate_kwargs = json.loads(args.generate_kwargs)
-    args.use_pre_sharded_checkpoints = args.model_name in [DS_INFERENCE_BLOOM_FP16, DS_INFERENCE_BLOOM_INT8]

     return args
diff --git a/ui.py b/ui.py
index fc3c176..b557c1d 100644
--- a/ui.py
+++ b/ui.py
@@ -3,7 +3,7 @@
 import requests
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import HTMLResponse
+from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.routing import APIRoute, Mount
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
@@ -48,9 +48,9 @@ def __init__(self, args: argparse.Namespace):
     def homepage(self, request: Request) -> HTMLResponse:
         return self.templates.TemplateResponse("index.html", {"request": request})

-    def generate(self, request: dict) -> str:
+    def generate(self, request: dict) -> JSONResponse:
         response = requests.post(f"http://{self.server_host}:{self.server_port}/generate", json=request, verify=False)
-        return response.json()
+        return JSONResponse(content=response.json())

     def run(self):
         # get around CORS
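
Not part of the patch itself, but a quick way to exercise the serving path it sets up once `make gen-proto`, `make ui` and `make bloom-560m` are running: the sketch below mirrors what `ui.py` does and POSTs to the generation server's /generate route. It assumes the gunicorn binding from the Makefile (127.0.0.1:5000), uses only the request fields visible in this diff (`text`, `min_length`, `do_sample`, `remove_input_from_output`), and assumes the JSON response exposes GenerateResponse's `text` and `num_generated_tokens`; adjust names if the actual schema differs.

# hedged usage sketch, not part of the diff above
import requests

payload = {
    "text": ["DeepSpeed is a machine learning framework"],  # batched input, as in GenerateRequest.text
    "min_length": 1,
    "do_sample": False,
    "remove_input_from_output": True,
}

# same call shape as Server.generate() in ui.py, just issued directly
response = requests.post("http://127.0.0.1:5000/generate", json=payload)
result = response.json()
print(result["text"], result["num_generated_tokens"])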