Dev/inferentia #61

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft · wants to merge 23 commits into base: main
10 changes: 10 additions & 0 deletions .github/workflows/build-container.yaml
@@ -34,6 +34,16 @@ jobs:
      TAILSCALE_AUTHKEY: ${{ secrets.TAILSCALE_AUTHKEY }}
      REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
      REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
  starlette-pytorch-neuron:
    uses: ./.github/workflows/docker-build-action.yaml
    with:
      image: inference-pytorch-neuron
      dockerfile: dockerfiles/pytorch/Dockerfile
      build_args: "BASE_IMAGE=ubuntu:22.04,NEURONX=1"
    secrets:
      TAILSCALE_AUTHKEY: ${{ secrets.TAILSCALE_AUTHKEY }}
      REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
      REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
  starlette-tensorflow-cpu:
    uses: ./.github/workflows/docker-build-action.yaml
    with:
4 changes: 2 additions & 2 deletions .github/workflows/docker-build-action.yaml
@@ -64,8 +64,8 @@ jobs:
          context: ${{ inputs.context }}
          build-args: ${{ inputs.build_args }}
          file: ${{ inputs.context }}/${{ inputs.dockerfile }}
          tags: ${{ inputs.repository }}/${{ inputs.image }}:sha-${{ env.GITHUB_SHA_SHORT }},${{ inputs.repository }}/${{ inputs.image }}:latest

          # tags: ${{ inputs.repository }}/${{ inputs.image }}:sha-${{ env.GITHUB_SHA_SHORT }},${{ inputs.repository }}/${{ inputs.image }}:latest
          tags: ${{ inputs.repository }}/${{ inputs.image }}:testraph
      - name: Tailscale Wait
        if: ${{ failure() || runner.debug == '1' }}
        uses: huggingface/tailscale-action@v1
11 changes: 7 additions & 4 deletions dockerfiles/pytorch/Dockerfile
@@ -1,6 +1,9 @@
ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04

FROM $BASE_IMAGE

ARG NEURONX=0

SHELL ["/bin/bash", "-c"]

LABEL maintainer="Hugging Face"
@@ -31,12 +34,12 @@ RUN apt-get update && \
    libsndfile1-dev \
    ffmpeg \
    && apt-get clean autoremove --yes \
    && rm -rf /var/lib/{apt,dpkg,cache,log}
    && rm -rf /var/lib/{apt,cache,log}

# Copying only necessary files as filtered by .dockerignore
COPY . .

# install wheel and setuptools
RUN pip install --no-cache-dir -U pip ".[torch, st, diffusers]"
RUN if [[ "$NEURONX" == "1" ]];then /bin/bash -c "./dockerfiles/pytorch/neuronx.sh";else pip install --no-cache-dir -U pip ".[torch, st, diffusers]";fi

# copy application
COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
@@ -45,4 +48,4 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
# copy entrypoint and change permissions
COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh

ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
41 changes: 41 additions & 0 deletions dockerfiles/pytorch/neuronx.sh
@@ -0,0 +1,41 @@
#!/bin/bash

set -e

# Install system prerequisites
apt-get update -y \
  && apt-get install -y --no-install-recommends \
    gnupg2 \
    wget

. /etc/os-release
tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

apt-get update -y \
  && apt-get install -y --no-install-recommends \
    aws-neuronx-dkms=2.* \
    aws-neuronx-collectives=2.* \
    aws-neuronx-runtime-lib=2.* \
    aws-neuronx-tools=2.*

pip install -U pip

# Version pins taken from the optimum-neuron TGI Dockerfile
pip3 install \
  neuronx-cc==2.13.66.0 \
  torch-neuronx==2.1.2.2.1.0 \
  transformers-neuronx==0.10.0.21 \
  --extra-index-url=https://pip.repos.neuron.amazonaws.com

pip3 install --extra-index-url=https://pip.repos.neuron.amazonaws.com optimum[neuronx,diffusers]

pip install ".[st,torch-neuronx]"

apt-get clean autoremove --yes

rm -rf /var/lib/{apt,cache,log}

echo "export PATH=\"$PATH:/opt/aws/neuron/bin\"" >> /root/.bashrc
5 changes: 4 additions & 1 deletion makefile
@@ -26,5 +26,8 @@ inference-pytorch-gpu:
inference-pytorch-cpu:
	docker build --build-arg="BASE_IMAGE=ubuntu:22.04" -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:cpu .

inference-pytorch-neuron:
	docker build --build-arg=BASE_IMAGE=ubuntu:22.04 --build-arg=NEURONX=1 -f dockerfiles/pytorch/Dockerfile -t integration-test-pytorch:neuron .

stop-all:
	docker stop $$(docker ps -a -q) && docker container prune --force
	docker stop $$(docker ps -a -q) && docker container prune --force
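For local testing, the new target pairs naturally with a run command along these lines; a sketch under assumptions (the /dev/neuron0 device path, port 5000, and the model id are illustrative, not taken from this PR):

make inference-pytorch-neuron
docker run --rm --device /dev/neuron0 \
  -e HF_MODEL_ID=stabilityai/stable-diffusion-2-1 \
  -e HF_TASK=text-to-image \
  -p 5000:5000 \
  integration-test-pytorch:neuron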
6 changes: 4 additions & 2 deletions setup.py
@@ -17,8 +17,8 @@
"wheel==0.42.0",
"setuptools==69.1.0",
"cmake==3.28.3",
"transformers[sklearn,sentencepiece, audio, vision]==4.38.2",
"huggingface_hub==0.20.3",
"transformers[sklearn,sentencepiece, audio, vision]>=4.38.2",
"huggingface_hub==0.23.0",
"orjson",
# vision
"Pillow",
@@ -39,6 +39,8 @@
extras["st"] = ["sentence_transformers==2.4.0"]
extras["diffusers"] = ["diffusers==0.26.3", "accelerate==0.27.2"]
extras["torch"] = ["torch==2.2.0", "torchvision", "torchaudio"]
# For neuronx
extras["torch-neuronx"] = ["torch-neuronx", "torchvision", "torchaudio"]
extras["tensorflow"] = ["tensorflow"]
extras["test"] = [
"pytest==7.2.1",
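The new torch-neuronx extra mirrors the existing torch one but leaves torch-neuronx unpinned; a hedged install sketch from a local checkout (the extra index is the same one neuronx.sh uses):

pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com ".[st,torch-neuronx]"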
100 changes: 97 additions & 3 deletions src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,12 +1,19 @@
import importlib.util
import json
import logging
import os

from transformers.utils.import_utils import is_torch_bf16_gpu_available

logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)

_diffusers = importlib.util.find_spec("diffusers") is not None
_optimum = importlib.util.find_spec("optimum") is not None
if _optimum:
    _optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
else:
    _optimum_neuron = False


def is_diffusers_available():
@@ -18,6 +25,10 @@ def is_diffusers_available():
from diffusers import AutoPipelineForText2Image, DPMSolverMultistepScheduler, StableDiffusionPipeline


if _optimum_neuron:
    from optimum import neuron


class IEAutoPipelineForText2Image:
    def __init__(self, model_dir: str, device: str = None):  # needs "cuda" for GPU
        dtype = torch.float32
@@ -55,8 +66,91 @@ def __call__(
}


def get_diffusers_pipeline(task=None, model_dir=None, device=-1, **kwargs):
def _is_neuron_model(model_dir):
    for root, _, files in os.walk(model_dir):
        for f in files:
            if f == "config.json":
                filename = os.path.join(root, f)
                with open(filename, 'r') as fh:
                    try:
                        config = json.load(fh)
                    except Exception as e:
                        logger.warning("Unable to load config %s properly, skipping", filename)
                        logger.exception(e)
                        continue
                if 'neuron' in config.keys():
                    return True
    return False


def neuron_diffusion_pipeline(task: str, model_dir: str):
    # Step 1: load config and look for _class_name
    try:
        config = StableDiffusionPipeline.load_config(pretrained_model_name_or_path=model_dir)
    except OSError as e:
        logger.error("Unable to load config file for repository %s", model_dir)
        logger.exception(e)
        raise

    pipeline_class_name = config['_class_name']
    logger.debug("Repository pipeline class name %s", pipeline_class_name)

    if "Diffusion" in pipeline_class_name and "XL" in pipeline_class_name:
        if task == "image-to-image":
            pipeline_class = neuron.NeuronStableDiffusionXLImg2ImgPipeline
        else:
            pipeline_class = neuron.NeuronStableDiffusionXLPipeline
    else:
        if task == "image-to-image":
            pipeline_class = neuron.NeuronStableDiffusionImg2ImgPipeline
        else:
            pipeline_class = neuron.NeuronStableDiffusionPipeline

    logger.debug("Pipeline class %s", pipeline_class.__name__)

    compiler_args = {
        "auto_cast": "matmul",
        "auto_cast_type": "bf16",
        "inline_weights_to_neff": os.environ.get("INLINE_WEIGHTS_TO_NEFF", "false").lower()
        not in ["false", "no", "0"],
        "data_parallel_mode": os.environ.get("DATA_PARALLEL_MODE", "unet"),
    }
    input_shapes = {
        "batch_size": 1,
        "height": int(os.environ.get("IMAGE_HEIGHT", 512)),
        "width": int(os.environ.get("IMAGE_WIDTH", 512)),
    }
    export_kwargs = {**compiler_args, **input_shapes, "export": True}

    # If the repository already holds a compiled neuron model, no additional kwargs
    # are needed: the info lies within the repo
    is_neuron_m = _is_neuron_model(model_dir)
    if is_neuron_m:
        kwargs = {}
        fallback_kwargs = export_kwargs
    else:
        kwargs = export_kwargs
        fallback_kwargs = {}

    # In the second case, exporting can take a huge amount of time, which makes
    # Endpoints not really a suited solution, at least as long as the cache is not
    # an option for diffusion
    try:
        logger.info("Loading model %s with kwargs %s", model_dir, kwargs)
        return pipeline_class.from_pretrained(model_dir, **kwargs)
    except Exception as e:
        logger.error("Unable to load model %s properly, falling back to kwargs %s", model_dir, fallback_kwargs)
        logger.exception(e)
        return pipeline_class.from_pretrained(model_dir, **fallback_kwargs)


def get_diffusers_pipeline(task=None, model_dir=None, device=-1, **_kwargs):
    """Get a pipeline for Diffusers models."""
    device = "cuda" if device == 0 else "cpu"
    pipeline = DIFFUSERS_TASKS[task](model_dir=model_dir, device=device)
    if device == 0:
        device = "cuda"
    elif device is not None:
        device = "cpu"
    # None case: neuronx, no need to specify device

    if device is not None:
        pipeline = DIFFUSERS_TASKS[task](model_dir=model_dir, device=device)
    else:
        pipeline = neuron_diffusion_pipeline(task=task, model_dir=model_dir)
    return pipeline
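The export path above is driven entirely by environment variables, so shapes and compiler behavior can be changed without touching code; a hedged example of overriding the knobs neuron_diffusion_pipeline reads (variable names from this diff, values and run flags illustrative):

# Compile for 768x768 images, keep weights out of the NEFF, and
# data-parallelize the whole pipeline rather than only the UNet.
docker run --rm --device /dev/neuron0 \
  -e HF_MODEL_ID=stabilityai/stable-diffusion-2-1 \
  -e HF_TASK=text-to-image \
  -e IMAGE_HEIGHT=768 -e IMAGE_WIDTH=768 \
  -e INLINE_WEIGHTS_TO_NEFF=false \
  -e DATA_PARALLEL_MODE=all \
  -p 5000:5000 integration-test-pytorch:neuron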
19 changes: 18 additions & 1 deletion src/huggingface_inference_toolkit/utils.py
@@ -30,6 +30,14 @@
import torch

_optimum_available = importlib.util.find_spec("optimum") is not None
if _optimum_available:
    _optimum_neuron = importlib.util.find_spec("optimum.neuron") is not None
else:
    _optimum_neuron = False

if _optimum_neuron:
    from optimum.neuron.modeling_decoder import get_available_cores as get_neuron_cores
else:
    def get_neuron_cores():
        return 0


def is_optimum_available():
@@ -38,6 +46,10 @@ def is_optimum_available():
    # return _optimum_available


def is_optimum_neuron_available():
    return _optimum_neuron


framework2weight = {
"pytorch": "pytorch*",
"tensorflow": "tf*",
@@ -215,6 +227,8 @@ def get_device():

    if gpu:
        return 0
    elif get_neuron_cores() > 0:
        return None
    else:
        return -1

@@ -229,7 +243,10 @@ def get_pipeline(
    create pipeline class for a specific task based on local saved model
    """
    device = get_device()
    logger.info(f"Using device { 'GPU' if device == 0 else 'CPU'}")
    logger.info(f"Using device { 'GPU' if device == 0 else 'Neuron' if device is None else 'CPU'}")

    if device is None and task != "text-to-image":
        raise Exception("This container only supports the text-to-image task on Neuron devices")

    if task is None:
        raise EnvironmentError(
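get_device() now distinguishes three cases: 0 for GPU, None for Neuron, and -1 for CPU, keyed on the core probe imported above. A one-liner to exercise that probe inside the image (the import path is taken from this diff; zero cores means the CPU branch is taken):

python3 -c "from optimum.neuron.modeling_decoder import get_available_cores as g; print(g())"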
7 changes: 4 additions & 3 deletions src/huggingface_inference_toolkit/webservice_starlette.py
@@ -1,4 +1,5 @@
import logging
import os
from pathlib import Path
from time import perf_counter

@@ -23,15 +24,15 @@


def config_logging(level=logging.INFO):
    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level)
    logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", datefmt="", level=level, force=True)
    # disable uvicorn access logs to hide /health
    uvicorn_access = logging.getLogger("uvicorn.access")
    uvicorn_access.disabled = True
    # remove double logs for errors
    logging.getLogger("uvicorn").removeHandler(logging.getLogger("uvicorn").handlers[0])


config_logging()
config_logging(os.environ.get("LOG_LEVEL", logging.getLevelName(logging.INFO)))
logger = logging.getLogger(__name__)


@@ -50,7 +51,7 @@ async def some_startup_task():
    else:
        raise ValueError(
            f"""Can't initialize model.
            Please set env HF_MODEL_DIR or provider a HF_MODEL_ID.
            Please set env HF_MODEL_DIR or provide a HF_MODEL_ID.
            Provided values are:
            HF_MODEL_DIR: {HF_MODEL_DIR} and HF_MODEL_ID:{HF_MODEL_ID}"""
        )
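With LOG_LEVEL now forwarded into config_logging, a debug run plus a smoke test might look like this; a sketch whose route and payload shape follow the toolkit's usual POST interface and are assumptions, not verbatim from this PR:

# LOG_LEVEL takes standard logging level names (DEBUG, INFO, WARNING)
docker run --rm --device /dev/neuron0 \
  -e LOG_LEVEL=DEBUG \
  -e HF_MODEL_ID=stabilityai/stable-diffusion-2-1 \
  -e HF_TASK=text-to-image \
  -p 5000:5000 integration-test-pytorch:neuron

# Once the server is up:
curl -s -X POST http://localhost:5000/ \
  -H "Content-Type: application/json" \
  -d '{"inputs": "An astronaut riding a horse on the moon"}' \
  --output generated.png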