
Commit 99a8c6f

Compile VAE V2 (huggingface#46)
Co-authored-by: leaves-zwx <kunta0932@gmail.com>
1 parent d9f76de commit 99a8c6f

4 files changed: +179 −44 lines changed
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+from collections import deque
+from timeit import default_timer as timer
+from .utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class OneFlowGraph(object):
+    def __init__(self, graph_class, *args, **kwargs):
+        self.graph_ = graph_class(*args, **kwargs)
+        self.is_compiled_ = False
+
+    @property
+    def is_compiled(self):
+        return self.is_compiled_
+
+    def compile(self, *args, **kwargs):
+        if self.is_compiled_:
+            return
+
+        global_class_name = self.graph_.__class__.__name__
+        logger.info(
+            f"[oneflow] compiling {global_class_name} beforehand to make sure the progress bar is more accurate",
+        )
+        compilation_start = timer()
+        compilation_time = 0
+        self.graph_._compile(*args, **kwargs)
+        compilation_time = timer() - compilation_start
+        logger.info(f"[oneflow] [elapsed(s)] [{global_class_name} compilation] {compilation_time:.3f}")
+
+        self.is_compiled_ = True
+
+    def __call__(self, *args, **kwargs):
+        if not self.is_compiled_:
+            self.compile(*args, **kwargs)
+
+        return self.graph_(*args, **kwargs)
+
+
+class LRUCache(object):
+    def __init__(self, cache_size):
+        self.cache_size = cache_size
+        self.queue = deque()
+        self.hash_map = dict()
+
+    def is_queue_full(self):
+        return len(self.queue) == self.cache_size
+
+    def pop(self):
+        pop_key = self.queue.pop()
+        value = self.hash_map.pop(pop_key)
+        del value
+        return pop_key
+
+    def set(self, key, value):
+        if key in self.hash_map:
+            return None
+
+        pop_key = None
+        while self.is_queue_full():
+            pop_key = self.pop()
+
+        self.queue.appendleft(key)
+        self.hash_map[key] = value
+        return pop_key if pop_key is not None else key
+
+    def get(self, key):
+        if key in self.hash_map:
+            self.queue.remove(key)
+            self.queue.appendleft(key)
+            return self.hash_map[key]
+
+        return None
+
+
+class OneFlowGraphCompileCache(object):
+    def __init__(self, cache_size=1):
+        self.cache_size_ = cache_size
+        self.cache_bucket_ = dict()
+
+    def set_cache_size(self, cache_size):
+        self.cache_size_ = cache_size
+
+        for cache in self.cache_bucket_.values():
+            cache.cache_size = cache_size
+
+    def get_graph(self, graph_class, cache_key, *args, **kwargs):
+        graph_class_name = graph_class.__name__
+        if graph_class_name not in self.cache_bucket_:
+            self.cache_bucket_[graph_class_name] = LRUCache(self.cache_size_)
+
+        compile_cache = self.cache_bucket_[graph_class_name]
+
+        graph = compile_cache.get(cache_key)
+        if graph is None:
+            graph = OneFlowGraph(graph_class, *args, **kwargs)
+            ret = compile_cache.set(cache_key, graph)
+            assert ret is not None
+
+            if ret != cache_key:
+                logger.info(
+                    f"[oneflow] a {graph_class_name} with cache key {ret} "
+                    "is deleted from cache according to the LRU policy",
+                )
+                if self.cache_size_ == 1:
+                    logger.info("[oneflow] cache size can be changed by `set_cache_size`")
+
+            logger.info(
+                f"[oneflow] a {graph_class_name} with cache key {cache_key} is appending to "
+                f"cache (cache_size={compile_cache.cache_size})",
+            )
+
+        return graph
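
For orientation (not part of the commit), here is a minimal usage sketch of the new compile cache. It assumes the module is importable as diffusers.oneflow_graph_compile_cache; DummyGraph, the linear layer, and the cache size are made up for illustration. get_graph returns a cached OneFlowGraph for the given key, compiling it lazily on first call and evicting the least recently used entry once the per-class cache is full.

# Hypothetical usage sketch; only OneFlowGraphCompileCache/OneFlowGraph come from this commit.
import oneflow as flow

from diffusers.oneflow_graph_compile_cache import OneFlowGraphCompileCache  # assumed import path


class DummyGraph(flow.nn.Graph):
    def __init__(self, module):
        super().__init__()
        self.module = module

    def build(self, x):
        return self.module(x)


cache = OneFlowGraphCompileCache(cache_size=2)
linear = flow.nn.Linear(8, 8).to("cuda")

# One compiled graph per input shape; the shape tuple is the cache key.
for batch_size in (1, 4, 1):
    x = flow.randn(batch_size, 8, device="cuda")
    graph = cache.get_graph(DummyGraph, (batch_size, 8), linear)
    y = graph(x)  # compiles on first use for this key, then reuses the compiled graph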

src/diffusers/pipeline_oneflow_utils.py

Lines changed: 7 additions & 0 deletions
@@ -50,6 +50,7 @@
     is_transformers_available,
     logging,
 )
+from .oneflow_graph_compile_cache import OneFlowGraphCompileCache


 if is_transformers_available():
@@ -159,6 +160,12 @@ class OneFlowDiffusionPipeline(ConfigMixin):
     config_name = "model_index.json"
     _optional_components = []

+    def init_graph_compile_cache(self, cache_size):
+        self.graph_compile_cache = OneFlowGraphCompileCache(cache_size)
+
+    def set_graph_compile_cache_size(self, cache_size):
+        self.graph_compile_cache.set_cache_size(cache_size)
+
     def register_modules(self, **kwargs):
         # import it here to avoid circular import
         from diffusers import pipelines
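
Roughly, the intent of the two new helpers (a sketch, not from the diff; `pipe` stands for any pipeline deriving from OneFlowDiffusionPipeline):

# init_graph_compile_cache is normally called once in the subclass __init__ (see the
# stable diffusion pipeline below); set_graph_compile_cache_size resizes it afterwards.
pipe.init_graph_compile_cache(cache_size=1)
pipe.set_graph_compile_cache_size(2)  # keep up to 2 compiled graphs per graph class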

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

Lines changed: 1 addition & 4 deletions
@@ -312,10 +312,7 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
         else:
             attention_mask = None

-        text_embeddings = self.text_encoder(
-            text_input_ids.to(device),
-            attention_mask=attention_mask,
-        )
+        text_embeddings = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
         text_embeddings = text_embeddings[0]

         # duplicate text embeddings for each generation per prompt, using mps friendly method

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_oneflow.py

Lines changed: 58 additions & 40 deletions
@@ -37,12 +37,13 @@
 from . import StableDiffusionPipelineOutput
 from .safety_checker_oneflow import OneFlowStableDiffusionSafetyChecker as StableDiffusionSafetyChecker

-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 from timeit import default_timer as timer
 import os
 import oneflow as flow
+
+
 class UNetGraph(flow.nn.Graph):
     def __init__(self, unet):
         super().__init__()
@@ -55,6 +56,37 @@ def build(self, latent_model_input, t, text_embeddings):
         text_embeddings = torch._C.amp_white_identity(text_embeddings)
         return self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

+
+class VaePostProcess(flow.nn.Module):
+    def __init__(self, vae) -> None:
+        super().__init__()
+        self.vae = vae
+
+    def forward(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        return image
+
+
+class VaeGraph(flow.nn.Graph):
+    def __init__(self, vae_post_process) -> None:
+        super().__init__()
+        self.vae_post_process = vae_post_process
+
+    def build(self, latents):
+        return self.vae_post_process(latents)
+
+
+class TextEncoderGraph(flow.nn.Graph):
+    def __init__(self, text_encoder) -> None:
+        super().__init__()
+        self.text_encoder = text_encoder
+
+    def build(self, text_input, attention_mask):
+        return self.text_encoder(text_input, attention_mask)[0]
+
+
 class OneFlowStableDiffusionPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using Stable Diffusion.
@@ -189,9 +221,7 @@ def __init__(
         )
         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
-        self.unet_graphs = dict()
-        self.unet_graphs_cache_size = 1
-        self.unet_graphs_lru_cache_time = 0
+        self.init_graph_compile_cache(1)

     def enable_xformers_memory_efficient_attention(self):
         r"""
@@ -288,9 +318,6 @@ def _execution_device(self):
         `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
         hooks.
         """
-        '''
-        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
-        '''
         if not hasattr(self.unet, "_hf_hook"):
             return self.device
         for module in self.unet.modules():
@@ -345,10 +372,7 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
         else:
             attention_mask = None

-        text_embeddings = self.text_encoder(
-            text_input_ids.to(device),
-            attention_mask=attention_mask,
-        )
+        text_embeddings = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
         text_embeddings = text_embeddings[0]

         # duplicate text embeddings for each generation per prompt, using mps friendly method
@@ -480,14 +504,13 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
     def set_unet_graphs_cache_size(self, cache_size: int):
         r"""
         Set the cache size of compiled unet graphs.
-
         This option is designed to control the GPU memory size.
-
         Args:
             cache_size ([`int`]):
                 New cache size, i.e., the maximum number of unet graphs.
         """
-        self.unet_graphs_cache_size = cache_size
+        logger.warning(f"`set_unet_graphs_cache_size` is deprecated, please use `set_graph_compile_cache_size` instead.")
+        self.set_graph_compile_cache_size(cache_size)

     @torch.no_grad()
     def __call__(
@@ -507,6 +530,7 @@ def __call__(
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: Optional[int] = 1,
         compile_unet: bool = True,
+        compile_vae: bool = True,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -599,35 +623,25 @@ def __call__(
             latents,
         )

-        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # compile vae graph
+        if compile_vae:
+            cache_key = (height, width, num_images_per_prompt)
+            vae_post_process = VaePostProcess(self.vae)
+            vae_post_process.eval()
+            vae_post_process_graph = self.graph_compile_cache.get_graph(VaeGraph, cache_key, vae_post_process)
+            vae_post_process_graph.compile(latents)

-        compilation_start = timer()
-        compilation_time = 0
+        # compile unet graph
         if compile_unet:
-            self.unet_graphs_lru_cache_time += 1
-            if (height, width) in self.unet_graphs:
-                _, unet_graph = self.unet_graphs[height, width]
-                self.unet_graphs[height, width] = (self.unet_graphs_lru_cache_time, unet_graph)
-            else:
-                while len(self.unet_graphs) >= self.unet_graphs_cache_size:
-                    shape_to_del = min(self.unet_graphs.keys(), key=lambda shape: self.unet_graphs[shape][0])
-                    print("[oneflow]", f"a compiled unet (height={shape_to_del[0]}, width={shape_to_del[1]}) "
-                        "is deleted according to the LRU policy")
-                    print("[oneflow]", "cache size can be changed by `pipeline.set_unet_graphs_cache_size`")
-                    del self.unet_graphs[shape_to_del]
-                print("[oneflow]", "compiling unet beforehand to make sure the progress bar is more accurate")
-                i, t = list(enumerate(self.scheduler.timesteps))[0]
-
+            cache_key = (height, width, num_images_per_prompt)
+            unet_graph = self.graph_compile_cache.get_graph(UNetGraph, cache_key, self.unet)
+            if unet_graph.is_compiled is False:
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                _, t = list(enumerate(self.scheduler.timesteps))[0]
+                unet_graph.compile(latent_model_input, t, text_embeddings)

-                unet_graph = UNetGraph(self.unet)
-                unet_graph._compile(latent_model_input, t, text_embeddings)
-                unet_graph(latent_model_input, t, text_embeddings)  # warmup
-                compilation_time = timer() - compilation_start
-                print("[oneflow]", "[elapsed(s)]", "[unet compilation]", compilation_time)
-                self.unet_graphs[height, width] = (self.unet_graphs_lru_cache_time, unet_graph)
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

         # 7. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -660,7 +674,11 @@ def __call__(
                        callback(i, t, latents)

         # 8. Post-processing
-        image = self.decode_latents(latents)
+        if compile_vae:
+            image = vae_post_process_graph(latents)
+            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        else:
+            image = self.decode_latents(latents)

         # 9. Run safety checker
         image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
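
Taken together, a hedged end-to-end sketch of what this commit enables (not part of the diff): both the UNet and the VAE post-processing now run as cached OneFlow graphs keyed by (height, width, num_images_per_prompt). The model id, prompt, and the assumption that OneFlowStableDiffusionPipeline is exported from the package root are placeholders for illustration.

from diffusers import OneFlowStableDiffusionPipeline  # assumed export path

# Hypothetical model id and settings.
pipe = OneFlowStableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe = pipe.to("cuda")
pipe.set_graph_compile_cache_size(2)  # optional: keep graphs for two shapes per graph class

# The first call compiles a UNetGraph and a VaeGraph for this
# (height, width, num_images_per_prompt) key; compile_unet / compile_vae default to True.
images = pipe(
    "a photo of an astronaut riding a horse",
    height=512,
    width=512,
    compile_unet=True,
    compile_vae=True,
).images

# A second call with the same shape reuses both compiled graphs.
images = pipe("a watercolor landscape", height=512, width=512).images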
