From e8189816815f971946880616fc18c9330329dcc9 Mon Sep 17 00:00:00 2001 From: SHIHONGHAO <13820618441@163.com> Date: Thu, 10 Aug 2023 16:59:55 +0800 Subject: [PATCH] stable diffusion stdcase (#191) * bert * fix * add * add MFU * vit * addsrc * sd --- .../stable_diffusion_v1_4/README.md | 60 + .../stable_diffusion_v1_4/pytorch/__init__.py | 5 + .../pytorch/dataloader.py | 31 + .../pytorch/evaluator.py | 12 + .../stable_diffusion_v1_4/pytorch/export.py | 41 + .../stable_diffusion_v1_4/pytorch/forward.py | 251 ++++ .../stable_diffusion_v1_4/pytorch/model.py | 16 + .../pytorch/model_utils/unet2d.py | 1064 +++++++++++++++++ .../pytorch/requirements.txt | 3 + inference/benchmarks/vit_l_16/README.md | 86 ++ .../benchmarks/vit_l_16/pytorch/__init__.py | 5 + .../benchmarks/vit_l_16/pytorch/dataloader.py | 49 + .../benchmarks/vit_l_16/pytorch/evaluator.py | 10 + .../benchmarks/vit_l_16/pytorch/export.py | 34 + .../benchmarks/vit_l_16/pytorch/forward.py | 106 ++ .../benchmarks/vit_l_16/pytorch/model.py | 14 + .../vit_l_16/pytorch/requirements.txt | 1 + .../stable_diffusion_v1_4/configurations.yaml | 16 + .../stable_diffusion_v1_4/parameters.yaml | 14 + .../vendor_config/nvidia_configurations.yaml | 3 + .../configs/vit_l_16/configurations.yaml | 16 + inference/configs/vit_l_16/parameters.yaml | 1 + .../vendor_config/nvidia_configurations.yaml | 3 + 23 files changed, 1841 insertions(+) create mode 100644 inference/benchmarks/stable_diffusion_v1_4/README.md create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py create mode 100755 inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt create mode 100644 inference/benchmarks/vit_l_16/README.md create mode 100644 inference/benchmarks/vit_l_16/pytorch/__init__.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/dataloader.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/evaluator.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/export.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/forward.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/model.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/requirements.txt create mode 100644 inference/configs/stable_diffusion_v1_4/configurations.yaml create mode 100644 inference/configs/stable_diffusion_v1_4/parameters.yaml create mode 100644 inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml create mode 100644 inference/configs/vit_l_16/configurations.yaml create mode 100644 inference/configs/vit_l_16/parameters.yaml create mode 100644 inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md new file mode 100644 index 000000000..07aade914 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/README.md @@ -0,0 +1,60 @@ +### 1. 推理数据集 + + +### 2. 
模型与权重 + +* 模型实现 + * pytorch:transformers.UNet2DConditionalModel +* 权重下载 + * pytorch:from_pretrained("CompViz/stable-diffusion-v1-4") + +### 2. 软硬件配置与运行信息参考 + +#### 2.1 Nvidia A100 + +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-2.1.0a0+4136153 + - 依赖软件版本: + - cuda: 12.1 + +- 推理工具包 + + - TensorRT 8.6.1 + +- 其他说明 + + - 本case在大批尺寸情况下涉及到了张量超过4B的情况,因此在大批尺寸离线批推理场景下,不宜作为性能及MFU基准。 + +### 3. 运行情况 + +* 指标列表 + +| 指标名称 | 指标值索引 | 特殊说明 | +| ------------------ | ---------------- | -------------------------------------------- | +| 数据精度 | precision | 可选fp32/fp16 | +| 批尺寸 | bs | | +| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | +| 验证总吞吐量 | p_val_whole | 实际验证prompts数除以总验证时间 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | +| 推理总吞吐量 | p_infer_whole | 实际推理prompts数除以总推理时间 | +| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 推理结果 | CLIP Score(推理/验证) | 单位为text2img耦合度分数 | + +* 指标值 + +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | CLIP Score | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | +| tensorrt | fp16 | 2 |1674.9 | 11.4 | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 | +| tensorrt | fp32 | 2 | 1807.4 | 8.2 | 20.6 | 7.2 | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 | +| null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 | +| null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 | diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py new file mode 100644 index 000000000..1f6cdf49b --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py @@ -0,0 +1,5 @@ +from .dataloader import build_dataloader +from .model import create_model +from .export import export_model +from .evaluator import evaluator +from .forward import model_forward, engine_forward diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py new file mode 100644 index 000000000..94f00f2f2 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py @@ -0,0 +1,31 @@ +from torch.utils.data import DataLoader as dl +import torch +import json +import random + + +def build_dataset(config): + + df = json.load(open(config.data_dir + "/" + config.prompts))["annotations"] + prompts = [] + for item in df: + prompts.append(item["caption"]) + dataset = [ + item for item in prompts if len(item) < config.prompt_max_len - 2 + ] + random.seed(config.random_seed) + dataset = random.sample(dataset, config.prompt_samples) + + return dataset + + +def build_dataloader(config): + dataset = build_dataset(config) + loader = dl(dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=True, + num_workers=config.num_workers, + pin_memory=True) + + return loader diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py new file mode 100644 index 000000000..824323809 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py @@ -0,0 +1,12 @@ +import torch + + +def evaluator(metric, image, prompt, config): + 
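+ # CLIPScore evaluation of generated images against their prompts:
+ # map the decoded VAE output from [-1, 1] back to [0, 1], convert it to
+ # uint8 HWC arrays on the CPU, then score each image/prompt pair with the
+ # torchmetrics CLIPScore metric and return the per-sample scores.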
scores = [] + image = (image / 2 + 0.5).clamp(0, 1) + image = image.detach().cpu().permute(0, 2, 3, 1).numpy() + image = (image * 255).round().astype("uint8") + image = torch.tensor(image) + for i in range(config.batch_size): + scores.append(float(metric(image[i], prompt[i]))) + return scores diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py new file mode 100644 index 000000000..60fa8fbb8 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py @@ -0,0 +1,41 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + latent = torch.randn(config.batch_size * 2, config.in_channels, + config.height // config.scale_size, + config.width // config.scale_size).cuda().float() + t = torch.randn([]).cuda().int() + embed = torch.randn(config.batch_size * 2, config.prompt_max_len, + config.embed_hidden_size).cuda().float() + + if config.fp16: + latent = latent.half() + embed = embed.half() + + dummy_input = (latent, t, embed) + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input_0", "input_1", "input_2"], + output_names=["output_0"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py new file mode 100644 index 000000000..a9314a90a --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py @@ -0,0 +1,251 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync +from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler +from transformers import CLIPTextModel, CLIPTokenizer +from torchmetrics.multimodal import CLIPScore + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size * config.num_inference_steps / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " ips") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size * config.num_inference_steps / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " ips") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + vae = AutoencoderKL.from_pretrained(config.data_dir + "/" + config.weights, + subfolder="vae") + tokenizer = CLIPTokenizer.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="text_encoder") + noise_scheduler = DDIMScheduler( + num_train_timesteps=config.num_train_timesteps, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + 
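+ # The DDIM scheduler above uses the scaled_linear beta schedule
+ # (0.00085 -> 0.012) of Stable Diffusion v1.x; set_timesteps()/step() below
+ # walk it over config.num_inference_steps denoising steps.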
vae.eval() + text_encoder.eval() + + metric = CLIPScore(model_name_or_path=config.data_dir + "/" + + config.eval_weights) + metric.eval() + + generator = torch.Generator().manual_seed(config.random_seed) + + start = time.time() + core_time = 0.0 + scores = [] + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + for step, prompt in enumerate(dataloader): + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + text_input = tokenizer(prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt") + + text_embeddings = text_encoder(text_input.input_ids)[0] + + max_length = text_input.input_ids.shape[-1] + uncond_input = tokenizer([""] * config.batch_size, + padding="max_length", + max_length=max_length, + return_tensors="pt") + + uncond_embeddings = text_encoder(uncond_input.input_ids)[0] + text_embeddings = torch.cat( + [uncond_embeddings, text_embeddings]) + + latents = torch.randn( + (config.batch_size, config.in_channels, config.height // + config.scale_size, config.width // config.scale_size), + generator=generator) + + noise_scheduler.set_timesteps(config.num_inference_steps) + + timesteps_tensor = torch.linspace( + config.num_train_timesteps - + config.num_train_timesteps // config.num_inference_steps, + 0, config.num_inference_steps).int() + + for t in timesteps_tensor: + latent_model_input = torch.cat([latents] * 2) + + torch_sync(config) + core_time_start = time.time() + if config.fp16: + noise_pred = model( + latent_model_input.cuda().to(torch.float16), + t.cuda(), + text_embeddings.cuda().to(torch.float16)) + else: + noise_pred = model(latent_model_input.cuda(), t.cuda(), + text_embeddings.cuda()) + + torch_sync(config) + core_time += time.time() - core_time_start + + noise_pred = noise_pred.to(torch.float32).cpu() + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + config.guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + latents = noise_scheduler.step(noise_pred, t, + latents).prev_sample + + latents = 1 / 0.18215 * latents + image = vae.decode(latents).sample + + scores_iter = evaluator(metric, image, prompt, config) + for score in scores_iter: + scores.append(score) + + duration = time.time() - start + logger.info("CLIP Scores: " + str(np.mean(scores))) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Validation") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(scores)), 3) + + +def engine_forward(model, dataloader, evaluator, config): + vae = AutoencoderKL.from_pretrained(config.data_dir + "/" + config.weights, + subfolder="vae") + tokenizer = CLIPTokenizer.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="text_encoder") + noise_scheduler = DDIMScheduler( + num_train_timesteps=config.num_train_timesteps, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + vae.eval() + text_encoder.eval() + + metric = CLIPScore(model_name_or_path=config.data_dir + "/" + + config.eval_weights) + metric.eval() + + generator = torch.Generator().manual_seed(config.random_seed) + + start = time.time() + core_time = 0.0 + scores = [] + for times in 
range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + for step, prompt in enumerate(dataloader): + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + text_input = tokenizer(prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt") + + text_embeddings = text_encoder(text_input.input_ids)[0] + + max_length = text_input.input_ids.shape[-1] + uncond_input = tokenizer([""] * config.batch_size, + padding="max_length", + max_length=max_length, + return_tensors="pt") + + uncond_embeddings = text_encoder(uncond_input.input_ids)[0] + text_embeddings = torch.cat( + [uncond_embeddings, text_embeddings]) + + latents = torch.randn( + (config.batch_size, config.in_channels, config.height // + config.scale_size, config.width // config.scale_size), + generator=generator) + + noise_scheduler.set_timesteps(config.num_inference_steps) + + timesteps_tensor = torch.linspace( + config.num_train_timesteps - + config.num_train_timesteps // config.num_inference_steps, + 0, config.num_inference_steps).int() + + for t in timesteps_tensor: + latent_model_input = torch.cat([latents] * 2) + + inputs = [latent_model_input, t, text_embeddings] + if config.fp16: + inputs = [ + latent_model_input.to(torch.float16), t, + text_embeddings.to(torch.float16) + ] + + torch_sync(config) + core_time_start = time.time() + outputs = model(inputs) + noise_pred = outputs[0] + foo_time = outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + noise_pred = noise_pred[0].float() + noise_pred = noise_pred.reshape( + config.batch_size * 2, config.in_channels, + config.height // config.scale_size, + config.width // config.scale_size) + noise_pred = noise_pred.cpu() + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + config.guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + latents = noise_scheduler.step(noise_pred, t, + latents).prev_sample + + latents = 1 / 0.18215 * latents + image = vae.decode(latents).sample + + scores_iter = evaluator(metric, image, prompt, config) + for score in scores_iter: + scores.append(score) + + duration = time.time() - start + logger.info("CLIP Scores: " + str(np.mean(scores))) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(scores)), 3) diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py new file mode 100644 index 000000000..e1b4db5cb --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py @@ -0,0 +1,16 @@ +from .model_utils.unet2d import UNet2DConditionModel + + +def create_model(config): + if config.no_validation: + assert config.exist_onnx_path is not None + return None + model = UNet2DConditionModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="unet") + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py new file mode 100755 index 000000000..cc803a9f3 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py @@ -0,0 +1,1064 @@ +# Copyright 2023 The 
HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.loaders import UNet2DConditionLoadersMixin +from diffusers.utils import BaseOutput, logging +from diffusers.models.activations import get_activation +from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor +from diffusers.models.embeddings import ( + GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.unet_2d_blocks import ( + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + get_down_block, + get_up_block, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet2DConditionOutput(BaseOutput): + """ + The output of [`UNet2DConditionModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: torch.FloatTensor = None + + +class UNet2DConditionModel(ModelMixin, ConfigMixin, + UNet2DConditionLoadersMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or + `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped. 
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): + Whether to include self-attention in the basic transformer blocks, see + [`~models.attention.BasicTransformerBlock`]. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim` + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + time_embedding_type (`str`, *optional*, defaults to `positional`): + The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. 
+ time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): + The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. + conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. + projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. + mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): + Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False` + otherwise. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + 
class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + only_cross_attention, + bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + num_attention_heads, + int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + attention_head_dim, + int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + if isinstance( + cross_attention_dim, + list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + layers_per_block, + int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." 
+ ) + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d(in_channels, + block_out_channels[0], + kernel_size=conv_in_kernel, + padding=conv_in_padding) + + # time + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError( + f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." + ) + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, + set_W_to_weight=False, + log=False, + flip_sin_to_cos=flip_sin_to_cos) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, + freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." + ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info( + "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined." + ) + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, + cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, + time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, + time_embed_dim, + act_fn=act_fn) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. 
+ # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. + self.class_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = nn.Linear( + projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, + time_embed_dim, + num_heads=addition_embed_type_num_heads) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, + image_embed_dim=cross_attention_dim, + time_embed_dim=time_embed_dim) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, + flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim) + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type is not None: + raise ValueError( + f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'." + ) + + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = get_activation(time_embedding_act_fn) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = only_cross_attention + + only_cross_attention = [only_cross_attention + ] * len(down_block_types) + + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = False + + if isinstance(num_attention_heads, int): + num_attention_heads = ( + num_attention_heads, ) * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim, ) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = ( + cross_attention_dim, ) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block + ] * len(down_block_types) + + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. 
The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None else output_channel, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + self.mid_block = UNetMidBlock2DSimpleCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim[-1], + attention_head_dim=attention_head_dim[-1], + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + only_cross_attention=mid_block_only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = list( + reversed(transformer_layers_per_block)) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = 
output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min( + i + 1, + len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block= + reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None else output_channel, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + eps=norm_eps) + + self.conv_act = get_activation(act_fn) + + else: + self.conv_norm_out = None + self.conv_act = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d(block_out_channels[0], + out_channels, + kernel_size=conv_out_kernel, + padding=conv_out_padding) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, + processors: Dict[str, + AttentionProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, + processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, + Dict[str, + AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, + processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, + processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + self.set_attn_processor(AttnProcessor()) + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance( + slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError( + f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. 
+ # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, + slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, + CrossAttnUpBlock2D, UpBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. 
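+ # e.g. with the 3 upsampling layers of the Stable Diffusion v1.4 UNet,
+ # the 64x64 latents used for 512x512 images are already a multiple of
+ # 2**3 = 8, so no forced interpolation size is needed.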
+ default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info( + "Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = ( + 1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], + dtype=dtype, + device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError( + "class_labels should be provided when num_class_embeds > 0" + ) + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. 
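+ # cast back to the sample dtype so the class-embedding layer sees the
+ # same precision as the rest of the model (relevant when running in fp16)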
+ class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to( + dtype=sample.dtype) + + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + # Kandinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", + encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": + encoder_hidden_states = self.encoder_hid_proj( + encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj( + encoder_hidden_states, 
image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None + + down_block_res_samples = (sample, ) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention" + ) and downsample_block.has_cross_attention: + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_block_additional_residuals) > 0: + additional_residuals[ + "additional_residuals"] = down_block_additional_residuals.pop( + 0) + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, + temb=emb) + + if is_adapter and len(down_block_additional_residuals) > 0: + sample += down_block_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + ( + down_block_res_sample, ) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets):] + down_block_res_samples = down_block_res_samples[:-len( + upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention" + ) and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = upsample_block(hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size) + + # 6. 
post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + return sample diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt b/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt new file mode 100644 index 000000000..2bd1558a3 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt @@ -0,0 +1,3 @@ +transformers +diffusers +torchmetrics diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md new file mode 100644 index 000000000..5998c0cf9 --- /dev/null +++ b/inference/benchmarks/vit_l_16/README.md @@ -0,0 +1,86 @@ +### 1. 推理数据集 +> Download website:https://image-net.org/ + +We use ImageNet2012 Validation Images: +| Dataset | FileName | Size | Checksum | +| ----------------------------- | ---------------------- | ----- | ------------------------------------- | +| Validation images (all tasks) | ILSVRC2012_img_val.tar | 6.3GB | MD5: 29b22e2961454d5413ddabcf34fc5622 | +Dataset format conversion: +https://github.com/pytorch/examples/blob/main/imagenet/extract_ILSVRC.sh + +make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar are in the same directory with extract_ILSVRC.sh. +```bash +sh extract_ILSVRC.sh +``` + +preview directory structures of decompressed dataset. + +```bash +tree -d -L 1 +``` + +``` +. +├── train +└── val +``` +dataset samples size + +```bash +find ./val -name "*JPEG" | wc -l +50000 +``` + +### 2. 模型与权重 + +* 模型实现 + * pytorch:transformers.ViTForImageClassification(hugging face) +* 权重下载 + * pytorch:from_pretrained("google/vit-large-patch16-224")(hugging face) + +### 2. 软硬件配置与运行信息参考 + +#### 2.1 Nvidia A100 + +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-1.13.0a0+937e930 + - 依赖软件版本: + - cuda: 11.8 + +- 推理工具包 + + - TensorRT 8.5.1.7 + - torch_tensorrt 1.3.0 + +### 3. 
运行情况 + +* 指标列表 + +| 指标名称 | 指标值索引 | 特殊说明 | +| ------------------ | ---------------- | -------------------------------------------- | +| 数据精度 | precision | 可选fp32/fp16 | +| 批尺寸 | bs | | +| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | +| 验证总吞吐量 | p_val_whole | 实际验证图片数除以总验证时间 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | +| 推理总吞吐量 | p_infer_whole | 实际推理图片数除以总推理时间 | +| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 推理结果 | acc(推理/验证) | 单位为top1分类准确率(acc1) | + +* 指标值 + +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | +| tensorrt | fp16 | 64 |1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 | +| tensorrt | fp32 | 32 | 1275.9 | 482.4 | 491.1 | 555.5 | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 | + diff --git a/inference/benchmarks/vit_l_16/pytorch/__init__.py b/inference/benchmarks/vit_l_16/pytorch/__init__.py new file mode 100644 index 000000000..1f6cdf49b --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/__init__.py @@ -0,0 +1,5 @@ +from .dataloader import build_dataloader +from .model import create_model +from .export import export_model +from .evaluator import evaluator +from .forward import model_forward, engine_forward diff --git a/inference/benchmarks/vit_l_16/pytorch/dataloader.py b/inference/benchmarks/vit_l_16/pytorch/dataloader.py new file mode 100644 index 000000000..d08453f1e --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/dataloader.py @@ -0,0 +1,49 @@ +import torchvision as tv +from torch.utils.data import DataLoader as dl +import torch +import tqdm + + +def build_dataset(config): + crop = 256 + c_crop = 224 + mean = (0.485, 0.456, 0.406) + std = (0.229, 0.224, 0.225) + + if config.fp16: + + class ToFloat16(object): + + def __call__(self, tensor): + return tensor.to(dtype=torch.float16) + + tx = tv.transforms.Compose([ + tv.transforms.Resize(crop), + tv.transforms.CenterCrop(c_crop), + tv.transforms.ToTensor(), + ToFloat16(), + tv.transforms.Normalize(mean=mean, std=std), + ]) + dataset = tv.datasets.ImageFolder(config.data_dir, tx) + else: + tx = tv.transforms.Compose([ + tv.transforms.Resize(crop), + tv.transforms.CenterCrop(c_crop), + tv.transforms.ToTensor(), + tv.transforms.Normalize(mean=mean, std=std), + ]) + dataset = tv.datasets.ImageFolder(config.data_dir, tx) + + return dataset + + +def build_dataloader(config): + dataset = build_dataset(config) + loader = dl(dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=True, + num_workers=config.num_workers, + pin_memory=True) + + return loader diff --git a/inference/benchmarks/vit_l_16/pytorch/evaluator.py b/inference/benchmarks/vit_l_16/pytorch/evaluator.py new file mode 100644 index 000000000..5481c5e5b --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/evaluator.py @@ -0,0 +1,10 @@ +def topk(output, target, ks=(1, )): + _, pred = output.topk(max(ks), 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + return [correct[:k].max(0)[0] for k in ks] + + +def evaluator(pred, ground_truth): + top1, top5 = topk(pred, ground_truth, ks=(1, 5)) + return top1 diff --git a/inference/benchmarks/vit_l_16/pytorch/export.py b/inference/benchmarks/vit_l_16/pytorch/export.py new file mode 100644 index 000000000..3df1a821b --- /dev/null +++ 
b/inference/benchmarks/vit_l_16/pytorch/export.py @@ -0,0 +1,34 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + dummy_input = torch.randn(config.batch_size, 3, 224, 224) + + if config.fp16: + dummy_input = dummy_input.half() + dummy_input = dummy_input.cuda() + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input"], + output_names=["output"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/vit_l_16/pytorch/forward.py b/inference/benchmarks/vit_l_16/pytorch/forward.py new file mode 100644 index 000000000..a61caf685 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/forward.py @@ -0,0 +1,106 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " ips") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " ips") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + start = time.time() + core_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + x = x.cuda() + y = y.cuda() + pred = model(x)[0] + torch_sync(config) + core_time += time.time() - core_time_start + + top1 = evaluator(pred, y) + + all_top1.extend(top1.cpu()) + + acc.append(np.mean(all_top1)) + + logger.info("Top1 Acc: " + str(acc)) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Validation") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(acc)), 3) + + +def engine_forward(model, dataloader, evaluator, config): + start = time.time() + core_time = 0.0 + foo_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + outputs = model([x]) + pred = outputs[0] + foo_time += outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + pred = pred[0].float() + pred = pred.reshape(config.batch_size, -1) + pred = pred.cpu() + top1 = evaluator(pred, y) + + 
all_top1.extend(top1.cpu()) + + acc.append(np.mean(all_top1)) + + logger.info("Top1 Acc: " + str(acc)) + + duration = time.time() - start - foo_time + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time - foo_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(acc)), 3) diff --git a/inference/benchmarks/vit_l_16/pytorch/model.py b/inference/benchmarks/vit_l_16/pytorch/model.py new file mode 100644 index 000000000..186148119 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/model.py @@ -0,0 +1,14 @@ +from transformers import ViTForImageClassification as vit + + +def create_model(config): + if config.no_validation: + assert config.exist_onnx_path is not None + return None + model = vit.from_pretrained(config.weights) + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/vit_l_16/pytorch/requirements.txt b/inference/benchmarks/vit_l_16/pytorch/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/inference/configs/stable_diffusion_v1_4/configurations.yaml b/inference/configs/stable_diffusion_v1_4/configurations.yaml new file mode 100644 index 000000000..77014a03b --- /dev/null +++ b/inference/configs/stable_diffusion_v1_4/configurations.yaml @@ -0,0 +1,16 @@ +batch_size: 2 +# 1 item(like 1 sequence, 1 image) flops +# Attention! For transformer decoder like bert, 1 token cause 2*param flops, so we need 2*length*params like 2*512*0.33B here +# format: a_1*a*2*...*a_nea_0,like 2*512*0.33e9(bert) or 4.12e9(resnet50) +flops: 6.78e11 +fp16: false +compiler: tensorrt +num_workers: 8 +log_freq: 5 +repeat: 1 +# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null +no_validation: false +# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt) +exist_onnx_path: null +# set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine +exist_compiler_path: null diff --git a/inference/configs/stable_diffusion_v1_4/parameters.yaml b/inference/configs/stable_diffusion_v1_4/parameters.yaml new file mode 100644 index 000000000..b8d6d33f0 --- /dev/null +++ b/inference/configs/stable_diffusion_v1_4/parameters.yaml @@ -0,0 +1,14 @@ +weights: "weights_v1_4" +eval_weights: "weights_evaluator" +prompts: "data_vizwiz/val.json" +random_seed: 0 +prompt_max_len: 77 +in_channels: 4 +height: 512 +width: 512 +scale_size: 8 +num_inference_steps: 50 +guidance_scale: 7.5 +prompt_samples: 10 +num_train_timesteps: 1000 +embed_hidden_size: 768 diff --git a/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml b/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml new file mode 100644 index 000000000..130eff42e --- /dev/null +++ b/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml @@ -0,0 +1,3 @@ +trt_tmp_path: nvidia_tmp/unet.trt +has_dynamic_axis: false +torchtrt_full_compile: true \ No newline at end of file diff --git a/inference/configs/vit_l_16/configurations.yaml b/inference/configs/vit_l_16/configurations.yaml new file mode 100644 index 000000000..da9354aa0 --- /dev/null +++ b/inference/configs/vit_l_16/configurations.yaml @@ -0,0 +1,16 @@ +batch_size: 32 +# 1 item(like 1 sequence, 1 image) flops +# Attention! 
For a transformer model like BERT, 1 token costs roughly 2*params flops, so we need 2*length*params, like 2*512*0.33B here
+# format: a_1*a_2*...*a_nea_0, like 2*512*0.33e9(bert) or 4.12e9(resnet50)
+flops: 6.16e10
+fp16: false
+compiler: tensorrt
+num_workers: 8
+log_freq: 30
+repeat: 5
+# skip validation (will also skip create_model and onnx export). Assert exist_onnx_path != null
+no_validation: false
+# set a real onnx_path to reuse an existing onnx file, or set it to anything but null to skip exporting onnx manually (e.g. for torch-tensorrt)
+exist_onnx_path: null
+# set an existing path of an engine file, like resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null
\ No newline at end of file
diff --git a/inference/configs/vit_l_16/parameters.yaml b/inference/configs/vit_l_16/parameters.yaml
new file mode 100644
index 000000000..d5d7da9dd
--- /dev/null
+++ b/inference/configs/vit_l_16/parameters.yaml
@@ -0,0 +1 @@
+weights: "google/vit-large-patch16-224"
\ No newline at end of file
diff --git a/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml
new file mode 100644
index 000000000..5fc40bbf6
--- /dev/null
+++ b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml
@@ -0,0 +1,3 @@
+trt_tmp_path: nvidia_tmp/vit.trt
+has_dynamic_axis: false
+torchtrt_full_compile: true
\ No newline at end of file
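
The attention-mask handling at the top of `forward()` in `unet2d.py` converts a 0/1 keep mask into an additive bias and gives it a singleton query_tokens dimension. The sketch below is a standalone illustration of that same arithmetic, not code from this patch; the tensor shapes and mask values are invented for the example.

```python
import torch

# Standalone illustration of the mask handling in unet2d.py's forward():
# a (1 = keep, 0 = discard) key-token mask becomes an additive bias (0 / -10000.0)
# with a singleton query_tokens dimension, so it can broadcast over attention scores.
batch, heads, query_tokens, key_tokens = 2, 4, 3, 5
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 0]])          # [batch, key_tokens]

bias = (1 - attention_mask.to(torch.float32)) * -10000.0  # keep -> 0.0, discard -> -10000.0
bias = bias.unsqueeze(1)                                  # [batch, 1, key_tokens]

# For scores in the [batch, heads, query_tokens, key_tokens] layout (e.g. torch sdp attn),
# one extra singleton dimension lets the bias broadcast across heads and query tokens.
scores = torch.zeros(batch, heads, query_tokens, key_tokens)
probs = (scores + bias.unsqueeze(1)).softmax(dim=-1)
print(probs[0, 0, 0])  # the two discarded keys of sample 0 get ~0 attention weight
```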
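`vit_l_16/pytorch/evaluator.py` measures top-k accuracy by comparing the indices of the k largest logits against the target class. The demo below reuses that function on toy inputs to show how the per-sample boolean hit flags come out; the logits and targets are invented for illustration only.

```python
import torch

# Self-contained demo of the top-k check in vit_l_16/pytorch/evaluator.py: for each
# sample, the flag says whether the target class appears among the k largest logits.
def topk(output, target, ks=(1, )):
    _, pred = output.topk(max(ks), 1, True, True)  # indices of the k largest logits
    pred = pred.t()                                # [k, batch]
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].max(0)[0] for k in ks]     # per-sample hit flags, one entry per k


# Toy logits/targets (3 classes, 2 samples):
logits = torch.tensor([[0.1, 0.7, 0.2],   # sample 0: top-1 prediction is class 1
                       [0.6, 0.3, 0.1]])  # sample 1: top-1 prediction is class 0
targets = torch.tensor([1, 2])
top1, top2 = topk(logits, targets, ks=(1, 2))
print(top1)  # tensor([ True, False]) -> sample 0 is correct at top-1
print(top2)  # tensor([ True, False]) -> class 2 is not in sample 1's top-2 either
```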
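The `*MFU` column in both result tables can be reconstructed from the `flops` value declared in each benchmark's `configurations.yaml` and the measured `*p_infer_core` throughput. The snippet below is a minimal sketch rather than the harness's actual implementation; it assumes MFU = per-item flops × items per second ÷ peak accelerator flops, and it assumes A100 dense peaks of 312 TFLOPS for fp16 and 156 TFLOPS for the TF32 path used by the fp32 runs. Under those assumptions it reproduces the values reported in the tables above.

```python
# Minimal sketch (not benchmark code): turning a per-item flops figure and a measured
# throughput into an MFU estimate. The peak values below are assumptions:
# A100 dense peak of 312 TFLOPS for fp16, 156 TFLOPS for the TF32 path used by fp32 runs.
A100_PEAK_FLOPS = {"fp16": 312e12, "fp32": 156e12}


def mfu(flops_per_item: float, items_per_second: float, precision: str) -> float:
    """Model flops utilization = achieved flops per second / peak flops per second."""
    return flops_per_item * items_per_second / A100_PEAK_FLOPS[precision]


# Values taken from this patch's configs (flops) and result tables (*p_infer_core):
print(f"vit_l_16 fp16:              {mfu(6.16e10, 1329.2, 'fp16'):.1%}")  # ~26.2%
print(f"vit_l_16 fp32:              {mfu(6.16e10, 590.5, 'fp32'):.1%}")   # ~23.3%
print(f"stable_diffusion_v1_4 fp16: {mfu(6.78e11, 60.6, 'fp16'):.1%}")    # ~13.2%
print(f"stable_diffusion_v1_4 fp32: {mfu(6.78e11, 16.1, 'fp32'):.1%}")    # ~7.0%
```

The same per-item convention explains the comment in `configurations.yaml`: a 512-token pass through a 0.33e9-parameter transformer is entered as 2*512*0.33e9 ≈ 3.4e11 flops per sequence.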