From e8189816815f971946880616fc18c9330329dcc9 Mon Sep 17 00:00:00 2001 From: SHIHONGHAO <13820618441@163.com> Date: Thu, 10 Aug 2023 16:59:55 +0800 Subject: [PATCH] stable diffusion stdcase (#191) * bert * fix * add * add MFU * vit * addsrc * sd --- .../stable_diffusion_v1_4/README.md | 60 + .../stable_diffusion_v1_4/pytorch/__init__.py | 5 + .../pytorch/dataloader.py | 31 + .../pytorch/evaluator.py | 12 + .../stable_diffusion_v1_4/pytorch/export.py | 41 + .../stable_diffusion_v1_4/pytorch/forward.py | 251 ++++ .../stable_diffusion_v1_4/pytorch/model.py | 16 + .../pytorch/model_utils/unet2d.py | 1064 +++++++++++++++++ .../pytorch/requirements.txt | 3 + inference/benchmarks/vit_l_16/README.md | 86 ++ .../benchmarks/vit_l_16/pytorch/__init__.py | 5 + .../benchmarks/vit_l_16/pytorch/dataloader.py | 49 + .../benchmarks/vit_l_16/pytorch/evaluator.py | 10 + .../benchmarks/vit_l_16/pytorch/export.py | 34 + .../benchmarks/vit_l_16/pytorch/forward.py | 106 ++ .../benchmarks/vit_l_16/pytorch/model.py | 14 + .../vit_l_16/pytorch/requirements.txt | 1 + .../stable_diffusion_v1_4/configurations.yaml | 16 + .../stable_diffusion_v1_4/parameters.yaml | 14 + .../vendor_config/nvidia_configurations.yaml | 3 + .../configs/vit_l_16/configurations.yaml | 16 + inference/configs/vit_l_16/parameters.yaml | 1 + .../vendor_config/nvidia_configurations.yaml | 3 + 23 files changed, 1841 insertions(+) create mode 100644 inference/benchmarks/stable_diffusion_v1_4/README.md create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py create mode 100755 inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt create mode 100644 inference/benchmarks/vit_l_16/README.md create mode 100644 inference/benchmarks/vit_l_16/pytorch/__init__.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/dataloader.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/evaluator.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/export.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/forward.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/model.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/requirements.txt create mode 100644 inference/configs/stable_diffusion_v1_4/configurations.yaml create mode 100644 inference/configs/stable_diffusion_v1_4/parameters.yaml create mode 100644 inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml create mode 100644 inference/configs/vit_l_16/configurations.yaml create mode 100644 inference/configs/vit_l_16/parameters.yaml create mode 100644 inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md new file mode 100644 index 000000000..07aade914 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/README.md @@ -0,0 +1,60 @@ +### 1. 推理数据集 + + +### 2. 
模型与权重 + +* 模型实现 + * pytorch:transformers.UNet2DConditionalModel +* 权重下载 + * pytorch:from_pretrained("CompViz/stable-diffusion-v1-4") + +### 2. 软硬件配置与运行信息参考 + +#### 2.1 Nvidia A100 + +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-2.1.0a0+4136153 + - 依赖软件版本: + - cuda: 12.1 + +- 推理工具包 + + - TensorRT 8.6.1 + +- 其他说明 + + - 本case在大批尺寸情况下涉及到了张量超过4B的情况,因此在大批尺寸离线批推理场景下,不宜作为性能及MFU基准。 + +### 3. 运行情况 + +* 指标列表 + +| 指标名称 | 指标值索引 | 特殊说明 | +| ------------------ | ---------------- | -------------------------------------------- | +| 数据精度 | precision | 可选fp32/fp16 | +| 批尺寸 | bs | | +| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | +| 验证总吞吐量 | p_val_whole | 实际验证prompts数除以总验证时间 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | +| 推理总吞吐量 | p_infer_whole | 实际推理prompts数除以总推理时间 | +| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 推理结果 | CLIP Score(推理/验证) | 单位为text2img耦合度分数 | + +* 指标值 + +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | CLIP Score | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | +| tensorrt | fp16 | 2 |1674.9 | 11.4 | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 | +| tensorrt | fp32 | 2 | 1807.4 | 8.2 | 20.6 | 7.2 | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 | +| null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 | +| null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 | diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py new file mode 100644 index 000000000..1f6cdf49b --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py @@ -0,0 +1,5 @@ +from .dataloader import build_dataloader +from .model import create_model +from .export import export_model +from .evaluator import evaluator +from .forward import model_forward, engine_forward diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py new file mode 100644 index 000000000..94f00f2f2 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py @@ -0,0 +1,31 @@ +from torch.utils.data import DataLoader as dl +import torch +import json +import random + + +def build_dataset(config): + + df = json.load(open(config.data_dir + "/" + config.prompts))["annotations"] + prompts = [] + for item in df: + prompts.append(item["caption"]) + dataset = [ + item for item in prompts if len(item) < config.prompt_max_len - 2 + ] + random.seed(config.random_seed) + dataset = random.sample(dataset, config.prompt_samples) + + return dataset + + +def build_dataloader(config): + dataset = build_dataset(config) + loader = dl(dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=True, + num_workers=config.num_workers, + pin_memory=True) + + return loader diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py new file mode 100644 index 000000000..824323809 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py @@ -0,0 +1,12 @@ +import torch + + +def evaluator(metric, image, prompt, config): + 
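+ # CLIPScore evaluation of generated images against their prompts:
+ # map the decoded VAE output from [-1, 1] back to [0, 1], convert it to
+ # uint8 HWC arrays on the CPU, then score each image/prompt pair with the
+ # torchmetrics CLIPScore metric and return the per-sample scores.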
scores = [] + image = (image / 2 + 0.5).clamp(0, 1) + image = image.detach().cpu().permute(0, 2, 3, 1).numpy() + image = (image * 255).round().astype("uint8") + image = torch.tensor(image) + for i in range(config.batch_size): + scores.append(float(metric(image[i], prompt[i]))) + return scores diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py new file mode 100644 index 000000000..60fa8fbb8 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py @@ -0,0 +1,41 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + latent = torch.randn(config.batch_size * 2, config.in_channels, + config.height // config.scale_size, + config.width // config.scale_size).cuda().float() + t = torch.randn([]).cuda().int() + embed = torch.randn(config.batch_size * 2, config.prompt_max_len, + config.embed_hidden_size).cuda().float() + + if config.fp16: + latent = latent.half() + embed = embed.half() + + dummy_input = (latent, t, embed) + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input_0", "input_1", "input_2"], + output_names=["output_0"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py new file mode 100644 index 000000000..a9314a90a --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py @@ -0,0 +1,251 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync +from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler +from transformers import CLIPTextModel, CLIPTokenizer +from torchmetrics.multimodal import CLIPScore + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size * config.num_inference_steps / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " ips") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size * config.num_inference_steps / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " ips") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + vae = AutoencoderKL.from_pretrained(config.data_dir + "/" + config.weights, + subfolder="vae") + tokenizer = CLIPTokenizer.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="text_encoder") + noise_scheduler = DDIMScheduler( + num_train_timesteps=config.num_train_timesteps, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + 
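+ # The DDIM scheduler above uses the scaled_linear beta schedule
+ # (0.00085 -> 0.012) of Stable Diffusion v1.x; set_timesteps()/step() below
+ # walk it over config.num_inference_steps denoising steps.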
vae.eval() + text_encoder.eval() + + metric = CLIPScore(model_name_or_path=config.data_dir + "/" + + config.eval_weights) + metric.eval() + + generator = torch.Generator().manual_seed(config.random_seed) + + start = time.time() + core_time = 0.0 + scores = [] + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + for step, prompt in enumerate(dataloader): + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + text_input = tokenizer(prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt") + + text_embeddings = text_encoder(text_input.input_ids)[0] + + max_length = text_input.input_ids.shape[-1] + uncond_input = tokenizer([""] * config.batch_size, + padding="max_length", + max_length=max_length, + return_tensors="pt") + + uncond_embeddings = text_encoder(uncond_input.input_ids)[0] + text_embeddings = torch.cat( + [uncond_embeddings, text_embeddings]) + + latents = torch.randn( + (config.batch_size, config.in_channels, config.height // + config.scale_size, config.width // config.scale_size), + generator=generator) + + noise_scheduler.set_timesteps(config.num_inference_steps) + + timesteps_tensor = torch.linspace( + config.num_train_timesteps - + config.num_train_timesteps // config.num_inference_steps, + 0, config.num_inference_steps).int() + + for t in timesteps_tensor: + latent_model_input = torch.cat([latents] * 2) + + torch_sync(config) + core_time_start = time.time() + if config.fp16: + noise_pred = model( + latent_model_input.cuda().to(torch.float16), + t.cuda(), + text_embeddings.cuda().to(torch.float16)) + else: + noise_pred = model(latent_model_input.cuda(), t.cuda(), + text_embeddings.cuda()) + + torch_sync(config) + core_time += time.time() - core_time_start + + noise_pred = noise_pred.to(torch.float32).cpu() + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + config.guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + latents = noise_scheduler.step(noise_pred, t, + latents).prev_sample + + latents = 1 / 0.18215 * latents + image = vae.decode(latents).sample + + scores_iter = evaluator(metric, image, prompt, config) + for score in scores_iter: + scores.append(score) + + duration = time.time() - start + logger.info("CLIP Scores: " + str(np.mean(scores))) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Validation") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(scores)), 3) + + +def engine_forward(model, dataloader, evaluator, config): + vae = AutoencoderKL.from_pretrained(config.data_dir + "/" + config.weights, + subfolder="vae") + tokenizer = CLIPTokenizer.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="text_encoder") + noise_scheduler = DDIMScheduler( + num_train_timesteps=config.num_train_timesteps, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + vae.eval() + text_encoder.eval() + + metric = CLIPScore(model_name_or_path=config.data_dir + "/" + + config.eval_weights) + metric.eval() + + generator = torch.Generator().manual_seed(config.random_seed) + + start = time.time() + core_time = 0.0 + scores = [] + for times in 
range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + for step, prompt in enumerate(dataloader): + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + text_input = tokenizer(prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt") + + text_embeddings = text_encoder(text_input.input_ids)[0] + + max_length = text_input.input_ids.shape[-1] + uncond_input = tokenizer([""] * config.batch_size, + padding="max_length", + max_length=max_length, + return_tensors="pt") + + uncond_embeddings = text_encoder(uncond_input.input_ids)[0] + text_embeddings = torch.cat( + [uncond_embeddings, text_embeddings]) + + latents = torch.randn( + (config.batch_size, config.in_channels, config.height // + config.scale_size, config.width // config.scale_size), + generator=generator) + + noise_scheduler.set_timesteps(config.num_inference_steps) + + timesteps_tensor = torch.linspace( + config.num_train_timesteps - + config.num_train_timesteps // config.num_inference_steps, + 0, config.num_inference_steps).int() + + for t in timesteps_tensor: + latent_model_input = torch.cat([latents] * 2) + + inputs = [latent_model_input, t, text_embeddings] + if config.fp16: + inputs = [ + latent_model_input.to(torch.float16), t, + text_embeddings.to(torch.float16) + ] + + torch_sync(config) + core_time_start = time.time() + outputs = model(inputs) + noise_pred = outputs[0] + foo_time = outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + noise_pred = noise_pred[0].float() + noise_pred = noise_pred.reshape( + config.batch_size * 2, config.in_channels, + config.height // config.scale_size, + config.width // config.scale_size) + noise_pred = noise_pred.cpu() + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + config.guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + latents = noise_scheduler.step(noise_pred, t, + latents).prev_sample + + latents = 1 / 0.18215 * latents + image = vae.decode(latents).sample + + scores_iter = evaluator(metric, image, prompt, config) + for score in scores_iter: + scores.append(score) + + duration = time.time() - start + logger.info("CLIP Scores: " + str(np.mean(scores))) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(scores)), 3) diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py new file mode 100644 index 000000000..e1b4db5cb --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py @@ -0,0 +1,16 @@ +from .model_utils.unet2d import UNet2DConditionModel + + +def create_model(config): + if config.no_validation: + assert config.exist_onnx_path is not None + return None + model = UNet2DConditionModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="unet") + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py new file mode 100755 index 000000000..cc803a9f3 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py @@ -0,0 +1,1064 @@ +# Copyright 2023 The 
HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.loaders import UNet2DConditionLoadersMixin +from diffusers.utils import BaseOutput, logging +from diffusers.models.activations import get_activation +from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor +from diffusers.models.embeddings import ( + GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.unet_2d_blocks import ( + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + get_down_block, + get_up_block, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet2DConditionOutput(BaseOutput): + """ + The output of [`UNet2DConditionModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: torch.FloatTensor = None + + +class UNet2DConditionModel(ModelMixin, ConfigMixin, + UNet2DConditionLoadersMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or + `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped. 
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): + Whether to include self-attention in the basic transformer blocks, see + [`~models.attention.BasicTransformerBlock`]. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim` + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + time_embedding_type (`str`, *optional*, defaults to `positional`): + The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. 
+ time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): + The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. + conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. + projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. + mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): + Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False` + otherwise. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + 
class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + only_cross_attention, + bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + num_attention_heads, + int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + attention_head_dim, + int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + if isinstance( + cross_attention_dim, + list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + layers_per_block, + int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." 
+ ) + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d(in_channels, + block_out_channels[0], + kernel_size=conv_in_kernel, + padding=conv_in_padding) + + # time + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError( + f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." + ) + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, + set_W_to_weight=False, + log=False, + flip_sin_to_cos=flip_sin_to_cos) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, + freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." + ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info( + "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined." + ) + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, + cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, + time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, + time_embed_dim, + act_fn=act_fn) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. 
+ # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. + self.class_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = nn.Linear( + projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, + time_embed_dim, + num_heads=addition_embed_type_num_heads) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, + image_embed_dim=cross_attention_dim, + time_embed_dim=time_embed_dim) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, + flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim) + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type is not None: + raise ValueError( + f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'." + ) + + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = get_activation(time_embedding_act_fn) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = only_cross_attention + + only_cross_attention = [only_cross_attention + ] * len(down_block_types) + + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = False + + if isinstance(num_attention_heads, int): + num_attention_heads = ( + num_attention_heads, ) * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim, ) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = ( + cross_attention_dim, ) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block + ] * len(down_block_types) + + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. 
The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None else output_channel, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + self.mid_block = UNetMidBlock2DSimpleCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim[-1], + attention_head_dim=attention_head_dim[-1], + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + only_cross_attention=mid_block_only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = list( + reversed(transformer_layers_per_block)) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = 
output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min( + i + 1, + len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block= + reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None else output_channel, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + eps=norm_eps) + + self.conv_act = get_activation(act_fn) + + else: + self.conv_norm_out = None + self.conv_act = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d(block_out_channels[0], + out_channels, + kernel_size=conv_out_kernel, + padding=conv_out_padding) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, + processors: Dict[str, + AttentionProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, + processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, + Dict[str, + AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, + processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, + processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + self.set_attn_processor(AttnProcessor()) + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance( + slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError( + f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. 
+ # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, + slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, + CrossAttnUpBlock2D, UpBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. 
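+ # e.g. with the 3 upsampling layers of the Stable Diffusion v1.4 UNet,
+ # the 64x64 latents used for 512x512 images are already a multiple of
+ # 2**3 = 8, so no forced interpolation size is needed.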
+ default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info( + "Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = ( + 1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], + dtype=dtype, + device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError( + "class_labels should be provided when num_class_embeds > 0" + ) + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. 
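+ # cast back to the sample dtype so the class-embedding layer sees the
+ # same precision as the rest of the model (relevant when running in fp16)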
+ class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to( + dtype=sample.dtype) + + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + # Kandinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", + encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": + encoder_hidden_states = self.encoder_hid_proj( + encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj( + encoder_hidden_states, 
image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None + + down_block_res_samples = (sample, ) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention" + ) and downsample_block.has_cross_attention: + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_block_additional_residuals) > 0: + additional_residuals[ + "additional_residuals"] = down_block_additional_residuals.pop( + 0) + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, + temb=emb) + + if is_adapter and len(down_block_additional_residuals) > 0: + sample += down_block_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + ( + down_block_res_sample, ) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets):] + down_block_res_samples = down_block_res_samples[:-len( + upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention" + ) and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = upsample_block(hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size) + + # 6. 
post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + return sample diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt b/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt new file mode 100644 index 000000000..2bd1558a3 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt @@ -0,0 +1,3 @@ +transformers +diffusers +torchmetrics diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md new file mode 100644 index 000000000..5998c0cf9 --- /dev/null +++ b/inference/benchmarks/vit_l_16/README.md @@ -0,0 +1,86 @@ +### 1. 推理数据集 +> Download website:https://image-net.org/ + +We use ImageNet2012 Validation Images: +| Dataset | FileName | Size | Checksum | +| ----------------------------- | ---------------------- | ----- | ------------------------------------- | +| Validation images (all tasks) | ILSVRC2012_img_val.tar | 6.3GB | MD5: 29b22e2961454d5413ddabcf34fc5622 | +Dataset format conversion: +https://github.com/pytorch/examples/blob/main/imagenet/extract_ILSVRC.sh + +make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar are in the same directory with extract_ILSVRC.sh. +```bash +sh extract_ILSVRC.sh +``` + +preview directory structures of decompressed dataset. + +```bash +tree -d -L 1 +``` + +``` +. +├── train +└── val +``` +dataset samples size + +```bash +find ./val -name "*JPEG" | wc -l +50000 +``` + +### 2. 模型与权重 + +* 模型实现 + * pytorch:transformers.ViTForImageClassification(hugging face) +* 权重下载 + * pytorch:from_pretrained("google/vit-large-patch16-224")(hugging face) + +### 2. 软硬件配置与运行信息参考 + +#### 2.1 Nvidia A100 + +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-1.13.0a0+937e930 + - 依赖软件版本: + - cuda: 11.8 + +- 推理工具包 + + - TensorRT 8.5.1.7 + - torch_tensorrt 1.3.0 + +### 3. 
运行情况 + +* 指标列表 + +| 指标名称 | 指标值索引 | 特殊说明 | +| ------------------ | ---------------- | -------------------------------------------- | +| 数据精度 | precision | 可选fp32/fp16 | +| 批尺寸 | bs | | +| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | +| 验证总吞吐量 | p_val_whole | 实际验证图片数除以总验证时间 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | +| 推理总吞吐量 | p_infer_whole | 实际推理图片数除以总推理时间 | +| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 推理结果 | acc(推理/验证) | 单位为top1分类准确率(acc1) | + +* 指标值 + +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | +| tensorrt | fp16 | 64 |1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 | +| tensorrt | fp32 | 32 | 1275.9 | 482.4 | 491.1 | 555.5 | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 | + diff --git a/inference/benchmarks/vit_l_16/pytorch/__init__.py b/inference/benchmarks/vit_l_16/pytorch/__init__.py new file mode 100644 index 000000000..1f6cdf49b --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/__init__.py @@ -0,0 +1,5 @@ +from .dataloader import build_dataloader +from .model import create_model +from .export import export_model +from .evaluator import evaluator +from .forward import model_forward, engine_forward diff --git a/inference/benchmarks/vit_l_16/pytorch/dataloader.py b/inference/benchmarks/vit_l_16/pytorch/dataloader.py new file mode 100644 index 000000000..d08453f1e --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/dataloader.py @@ -0,0 +1,49 @@ +import torchvision as tv +from torch.utils.data import DataLoader as dl +import torch +import tqdm + + +def build_dataset(config): + crop = 256 + c_crop = 224 + mean = (0.485, 0.456, 0.406) + std = (0.229, 0.224, 0.225) + + if config.fp16: + + class ToFloat16(object): + + def __call__(self, tensor): + return tensor.to(dtype=torch.float16) + + tx = tv.transforms.Compose([ + tv.transforms.Resize(crop), + tv.transforms.CenterCrop(c_crop), + tv.transforms.ToTensor(), + ToFloat16(), + tv.transforms.Normalize(mean=mean, std=std), + ]) + dataset = tv.datasets.ImageFolder(config.data_dir, tx) + else: + tx = tv.transforms.Compose([ + tv.transforms.Resize(crop), + tv.transforms.CenterCrop(c_crop), + tv.transforms.ToTensor(), + tv.transforms.Normalize(mean=mean, std=std), + ]) + dataset = tv.datasets.ImageFolder(config.data_dir, tx) + + return dataset + + +def build_dataloader(config): + dataset = build_dataset(config) + loader = dl(dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=True, + num_workers=config.num_workers, + pin_memory=True) + + return loader diff --git a/inference/benchmarks/vit_l_16/pytorch/evaluator.py b/inference/benchmarks/vit_l_16/pytorch/evaluator.py new file mode 100644 index 000000000..5481c5e5b --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/evaluator.py @@ -0,0 +1,10 @@ +def topk(output, target, ks=(1, )): + _, pred = output.topk(max(ks), 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + return [correct[:k].max(0)[0] for k in ks] + + +def evaluator(pred, ground_truth): + top1, top5 = topk(pred, ground_truth, ks=(1, 5)) + return top1 diff --git a/inference/benchmarks/vit_l_16/pytorch/export.py b/inference/benchmarks/vit_l_16/pytorch/export.py new file mode 100644 index 000000000..3df1a821b --- /dev/null +++ 
b/inference/benchmarks/vit_l_16/pytorch/export.py @@ -0,0 +1,34 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + dummy_input = torch.randn(config.batch_size, 3, 224, 224) + + if config.fp16: + dummy_input = dummy_input.half() + dummy_input = dummy_input.cuda() + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input"], + output_names=["output"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/vit_l_16/pytorch/forward.py b/inference/benchmarks/vit_l_16/pytorch/forward.py new file mode 100644 index 000000000..a61caf685 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/forward.py @@ -0,0 +1,106 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " ips") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " ips") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + start = time.time() + core_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + x = x.cuda() + y = y.cuda() + pred = model(x)[0] + torch_sync(config) + core_time += time.time() - core_time_start + + top1 = evaluator(pred, y) + + all_top1.extend(top1.cpu()) + + acc.append(np.mean(all_top1)) + + logger.info("Top1 Acc: " + str(acc)) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Validation") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(acc)), 3) + + +def engine_forward(model, dataloader, evaluator, config): + start = time.time() + core_time = 0.0 + foo_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + outputs = model([x]) + pred = outputs[0] + foo_time += outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + pred = pred[0].float() + pred = pred.reshape(config.batch_size, -1) + pred = pred.cpu() + top1 = evaluator(pred, y) + + 
all_top1.extend(top1.cpu()) + + acc.append(np.mean(all_top1)) + + logger.info("Top1 Acc: " + str(acc)) + + duration = time.time() - start - foo_time + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time - foo_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(acc)), 3) diff --git a/inference/benchmarks/vit_l_16/pytorch/model.py b/inference/benchmarks/vit_l_16/pytorch/model.py new file mode 100644 index 000000000..186148119 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/model.py @@ -0,0 +1,14 @@ +from transformers import ViTForImageClassification as vit + + +def create_model(config): + if config.no_validation: + assert config.exist_onnx_path is not None + return None + model = vit.from_pretrained(config.weights) + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/vit_l_16/pytorch/requirements.txt b/inference/benchmarks/vit_l_16/pytorch/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/inference/configs/stable_diffusion_v1_4/configurations.yaml b/inference/configs/stable_diffusion_v1_4/configurations.yaml new file mode 100644 index 000000000..77014a03b --- /dev/null +++ b/inference/configs/stable_diffusion_v1_4/configurations.yaml @@ -0,0 +1,16 @@ +batch_size: 2 +# 1 item(like 1 sequence, 1 image) flops +# Attention! For transformer decoder like bert, 1 token cause 2*param flops, so we need 2*length*params like 2*512*0.33B here +# format: a_1*a*2*...*a_nea_0,like 2*512*0.33e9(bert) or 4.12e9(resnet50) +flops: 6.78e11 +fp16: false +compiler: tensorrt +num_workers: 8 +log_freq: 5 +repeat: 1 +# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null +no_validation: false +# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt) +exist_onnx_path: null +# set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine +exist_compiler_path: null diff --git a/inference/configs/stable_diffusion_v1_4/parameters.yaml b/inference/configs/stable_diffusion_v1_4/parameters.yaml new file mode 100644 index 000000000..b8d6d33f0 --- /dev/null +++ b/inference/configs/stable_diffusion_v1_4/parameters.yaml @@ -0,0 +1,14 @@ +weights: "weights_v1_4" +eval_weights: "weights_evaluator" +prompts: "data_vizwiz/val.json" +random_seed: 0 +prompt_max_len: 77 +in_channels: 4 +height: 512 +width: 512 +scale_size: 8 +num_inference_steps: 50 +guidance_scale: 7.5 +prompt_samples: 10 +num_train_timesteps: 1000 +embed_hidden_size: 768 diff --git a/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml b/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml new file mode 100644 index 000000000..130eff42e --- /dev/null +++ b/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml @@ -0,0 +1,3 @@ +trt_tmp_path: nvidia_tmp/unet.trt +has_dynamic_axis: false +torchtrt_full_compile: true \ No newline at end of file diff --git a/inference/configs/vit_l_16/configurations.yaml b/inference/configs/vit_l_16/configurations.yaml new file mode 100644 index 000000000..da9354aa0 --- /dev/null +++ b/inference/configs/vit_l_16/configurations.yaml @@ -0,0 +1,16 @@ +batch_size: 32 +# 1 item(like 1 sequence, 1 image) flops +# Attention! 
For a transformer model like BERT, 1 token costs roughly 2*params flops, so we need 2*length*params, like 2*512*0.33B here
+# format: a_1*a_2*...*a_nea_0, like 2*512*0.33e9(bert) or 4.12e9(resnet50)
+flops: 6.16e10
+fp16: false
+compiler: tensorrt
+num_workers: 8
+log_freq: 30
+repeat: 5
+# skip validation (will also skip create_model and onnx export). Assert exist_onnx_path != null
+no_validation: false
+# set a real onnx_path to reuse an existing onnx file, or set it to anything but null to skip exporting onnx manually (e.g. for torch-tensorrt)
+exist_onnx_path: null
+# set an existing path of an engine file, like resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null
\ No newline at end of file
diff --git a/inference/configs/vit_l_16/parameters.yaml b/inference/configs/vit_l_16/parameters.yaml
new file mode 100644
index 000000000..d5d7da9dd
--- /dev/null
+++ b/inference/configs/vit_l_16/parameters.yaml
@@ -0,0 +1 @@
+weights: "google/vit-large-patch16-224"
\ No newline at end of file
diff --git a/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml
new file mode 100644
index 000000000..5fc40bbf6
--- /dev/null
+++ b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml
@@ -0,0 +1,3 @@
+trt_tmp_path: nvidia_tmp/vit.trt
+has_dynamic_axis: false
+torchtrt_full_compile: true
\ No newline at end of file
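
The attention-mask handling at the top of `forward()` in `unet2d.py` converts a 0/1 keep mask into an additive bias and gives it a singleton query_tokens dimension. The sketch below is a standalone illustration of that same arithmetic, not code from this patch; the tensor shapes and mask values are invented for the example.

```python
import torch

# Standalone illustration of the mask handling in unet2d.py's forward():
# a (1 = keep, 0 = discard) key-token mask becomes an additive bias (0 / -10000.0)
# with a singleton query_tokens dimension, so it can broadcast over attention scores.
batch, heads, query_tokens, key_tokens = 2, 4, 3, 5
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 0]])          # [batch, key_tokens]

bias = (1 - attention_mask.to(torch.float32)) * -10000.0  # keep -> 0.0, discard -> -10000.0
bias = bias.unsqueeze(1)                                  # [batch, 1, key_tokens]

# For scores in the [batch, heads, query_tokens, key_tokens] layout (e.g. torch sdp attn),
# one extra singleton dimension lets the bias broadcast across heads and query tokens.
scores = torch.zeros(batch, heads, query_tokens, key_tokens)
probs = (scores + bias.unsqueeze(1)).softmax(dim=-1)
print(probs[0, 0, 0])  # the two discarded keys of sample 0 get ~0 attention weight
```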
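`vit_l_16/pytorch/evaluator.py` measures top-k accuracy by comparing the indices of the k largest logits against the target class. The demo below reuses that function on toy inputs to show how the per-sample boolean hit flags come out; the logits and targets are invented for illustration only.

```python
import torch

# Self-contained demo of the top-k check in vit_l_16/pytorch/evaluator.py: for each
# sample, the flag says whether the target class appears among the k largest logits.
def topk(output, target, ks=(1, )):
    _, pred = output.topk(max(ks), 1, True, True)  # indices of the k largest logits
    pred = pred.t()                                # [k, batch]
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].max(0)[0] for k in ks]     # per-sample hit flags, one entry per k


# Toy logits/targets (3 classes, 2 samples):
logits = torch.tensor([[0.1, 0.7, 0.2],   # sample 0: top-1 prediction is class 1
                       [0.6, 0.3, 0.1]])  # sample 1: top-1 prediction is class 0
targets = torch.tensor([1, 2])
top1, top2 = topk(logits, targets, ks=(1, 2))
print(top1)  # tensor([ True, False]) -> sample 0 is correct at top-1
print(top2)  # tensor([ True, False]) -> class 2 is not in sample 1's top-2 either
```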
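The `*MFU` column in both result tables can be reconstructed from the `flops` value declared in each benchmark's `configurations.yaml` and the measured `*p_infer_core` throughput. The snippet below is a minimal sketch rather than the harness's actual implementation; it assumes MFU = per-item flops × items per second ÷ peak accelerator flops, and it assumes A100 dense peaks of 312 TFLOPS for fp16 and 156 TFLOPS for the TF32 path used by the fp32 runs. Under those assumptions it reproduces the values reported in the tables above.

```python
# Minimal sketch (not benchmark code): turning a per-item flops figure and a measured
# throughput into an MFU estimate. The peak values below are assumptions:
# A100 dense peak of 312 TFLOPS for fp16, 156 TFLOPS for the TF32 path used by fp32 runs.
A100_PEAK_FLOPS = {"fp16": 312e12, "fp32": 156e12}


def mfu(flops_per_item: float, items_per_second: float, precision: str) -> float:
    """Model flops utilization = achieved flops per second / peak flops per second."""
    return flops_per_item * items_per_second / A100_PEAK_FLOPS[precision]


# Values taken from this patch's configs (flops) and result tables (*p_infer_core):
print(f"vit_l_16 fp16:              {mfu(6.16e10, 1329.2, 'fp16'):.1%}")  # ~26.2%
print(f"vit_l_16 fp32:              {mfu(6.16e10, 590.5, 'fp32'):.1%}")   # ~23.3%
print(f"stable_diffusion_v1_4 fp16: {mfu(6.78e11, 60.6, 'fp16'):.1%}")    # ~13.2%
print(f"stable_diffusion_v1_4 fp32: {mfu(6.78e11, 16.1, 'fp32'):.1%}")    # ~7.0%
```

The same per-item convention explains the comment in `configurations.yaml`: a 512-token pass through a 0.33e9-parameter transformer is entered as 2*512*0.33e9 ≈ 3.4e11 flops per sequence.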