import os
import sys

# Make the shared example utilities in the parent directory importable.
sys.path.append("..")

import time

import torch
from diffusers import (
    LTXConditionPipeline,
    LTXLatentUpsamplePipeline,
    AutoencoderKLLTXVideo,
)
from diffusers.quantizers import PipelineQuantizationConfig
from diffusers.utils import export_to_video

from utils import (
    cachify,
    get_args,
    maybe_destroy_distributed,
    maybe_init_distributed,
    strify,
)
import cache_dit
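# cache-dit provides training-free, block-level caching for diffusion
# transformers; it is applied below via `cachify` and its cache-hit
# statistics are read back with `cache_dit.summary`.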

# NOTE: Use `--attn flash` for LTX-Video with context parallelism;
# otherwise it may raise an "attention mask not supported" error.

args = get_args()
print(args)

rank, device = maybe_init_distributed(args)

pipe = LTXConditionPipeline.from_pretrained(
    os.environ.get("LTX_VIDEO_DIR", "Lightricks/LTX-Video-0.9.7-dev"),
    torch_dtype=torch.bfloat16,
    # Quantize the heaviest components to 4-bit NF4; compute stays in bf16.
    quantization_config=PipelineQuantizationConfig(
        quant_backend="bitsandbytes_4bit",
        quant_kwargs={
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": torch.bfloat16,
        },
        components_to_quantize=["text_encoder", "transformer"],
    ),
)
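# NF4 stores weights in 4 bits, so the quantized text encoder and
# transformer need roughly a quarter of their bf16 weight memory.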

# The spatial upscaler reuses the base pipeline's VAE, so only one copy of
# the VAE weights is loaded.
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    os.environ.get(
        "LTX_UPSCALER_DIR", "Lightricks/ltxv-spatial-upscaler-0.9.7"
    ),
    vae=pipe.vae,
    torch_dtype=torch.bfloat16,
)

pipe.to(device)
pipe_upsample.to(device)
assert isinstance(pipe.vae, AutoencoderKLLTXVideo)
assert isinstance(pipe_upsample.vae, AutoencoderKLLTXVideo)

# Only rank 0 shows progress bars when running distributed.
pipe.set_progress_bar_config(disable=rank != 0)
pipe_upsample.set_progress_bar_config(disable=rank != 0)

if args.cache or args.parallel_type is not None:
    # Apply cache-dit's caching and/or context parallelism to the pipeline,
    # as selected by the CLI flags.
    cachify(args, pipe)


def round_to_nearest_resolution_acceptable_by_vae(height, width):
    # Round DOWN to the nearest multiple of the VAE's spatial compression
    # ratio so the latent grid has integer spatial dimensions.
    height = height - (height % pipe.vae_spatial_compression_ratio)
    width = width - (width % pipe.vae_spatial_compression_ratio)
    return height, width


prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = (
    "worst quality, inconsistent motion, blurry, jittery, distorted"
)
expected_height, expected_width = 512, 704
downscale_factor = 2 / 3
num_frames = 49
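# Note: LTX works with frame counts of the form 8*k + 1 (its VAE compresses
# time by 8x); here 49 = 8 * 6 + 1.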

# Part 1. Generate the video at a smaller resolution.
downscaled_height = int(expected_height * downscale_factor)
downscaled_width = int(expected_width * downscale_factor)
downscaled_height, downscaled_width = (
    round_to_nearest_resolution_acceptable_by_vae(
        downscaled_height, downscaled_width
    )
)
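# With the defaults above and LTX's spatial compression ratio of 32,
# 512x704 scales to 341x469 and rounds down to 320x448.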


def run_pipe(warmup: bool = False):
    # Generate latents at the downscaled resolution. Warmup runs use only
    # 4 steps to prime compilation and caches cheaply.
    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=4 if warmup else 30,
        generator=torch.Generator("cpu").manual_seed(0),
        output_type="latent",
    ).frames

    # Part 2. Upscale the generated latents with the latent upsampler.
    # The available latent upsampler upscales height/width by 2x.
    upscaled_height, upscaled_width = (
        downscaled_height * 2,
        downscaled_width * 2,
    )
    upscaled_latents = pipe_upsample(
        latents=latents, output_type="latent"
    ).frames

    # Warmup runs stop here; Part 3 only matters for the timed run.
    if warmup:
        return None

    # Part 3. Denoise the upscaled latents with a few steps to improve
    # texture (optional, but recommended).
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=0.4,  # effectively 4 of the 10 inference steps
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=torch.Generator("cpu").manual_seed(0),
        output_type="pil",
    ).frames[0]
    return video


# Warmup: trigger compilation, cache allocation, and any lazy distributed
# setup before timing the real run.
_ = run_pipe(warmup=True)

start = time.time()
video = run_pipe()
end = time.time()
# Collect cache-dit's cache-hit statistics for this pipeline.
stats = cache_dit.summary(pipe)

if rank == 0:
    # Part 4. Downscale the video to the expected resolution.
    video = [
        frame.resize((expected_width, expected_height)) for frame in video
    ]

    time_cost = end - start
    save_path = f"ltx-video.{strify(args, stats)}.mp4"
    print(f"Time cost: {time_cost:.2f}s")
    print(f"Saving video to {save_path}")
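    # 49 frames at 8 fps yields roughly a 6-second clip.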
    export_to_video(video, save_path, fps=8)

maybe_destroy_distributed()
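
# A typical invocation (flag names assumed from the local utils' CLI;
# adjust to match `get_args`):
#   torchrun --nproc_per_node=2 <this_script>.py --attn flash --cache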