Merged
2 changes: 1 addition & 1 deletion .github/workflows/codespell.yml
@@ -38,7 +38,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
pip install -e ".[lint]"
- name: Spelling check with codespell
run: |
# Refer to the above environment variable here
2 changes: 1 addition & 1 deletion .github/workflows/ruff.yml
@@ -41,7 +41,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
pip install -e ".[lint]"
- name: Analysing the code with ruff
run: |
ruff check .
4 changes: 1 addition & 3 deletions .github/workflows/test.yml
@@ -23,11 +23,9 @@ jobs:
python -m pip install --upgrade pip setuptools wheel
pip install torch
pip install packaging ninja
# remove st-attn dependency because no cuda environment
sed -i '/st_attn/d' pyproject.toml
pip install -e .
pip install pytest

- name: Run Pytest
run: |
pytest --ignore csrc/sliding_tile_attention/test
pytest --ignore csrc/sliding_tile_attention/test
45 changes: 26 additions & 19 deletions README.md
@@ -6,8 +6,8 @@ FastVideo is a lightweight framework for accelerating large video diffusion models


<p align="center">
🤗 <a href="https://huggingface.co/FastVideo/FastHunyuan" target="_blank">FastHunyuan</a> | 🤗 <a href="https://huggingface.co/FastVideo/FastMochi-diffusers" target="_blank">FastMochi</a> | 🟣💬 <a href="https://join.slack.com/t/fastvideo/shared_invite/zt-2zf6ru791-sRwI9lPIUJQq1mIeB_yjJg" target="_blank"> Slack </a>
</p>
🤗 <a href="https://huggingface.co/FastVideo/FastHunyuan" target="_blank">FastHunyuan</a> | 🤗 <a href="https://huggingface.co/FastVideo/FastMochi-diffusers" target="_blank">FastMochi</a> | 🟣💬 <a href="https://join.slack.com/t/fastvideo/shared_invite/zt-2zf6ru791-sRwI9lPIUJQq1mIeB_yjJg" target="_blank"> Slack </a>
</p>



@@ -38,18 +38,25 @@ Dev in progress and highly experimental.
- ```2024/12/17```: `FastVideo` v1.0 is released.


## 🔧 Installation
## 🔧 Installation from source
The code is tested on Python 3.10.0, CUDA 12.4 and H100.
```
./env_setup.sh fastvideo
# Clone FastVideo
git clone https://github.com/hao-ai-lab/FastVideo.git && cd FastVideo

# Install FastVideo
pip install -e .

# Install Flash Attention (optional)
pip install flash-attn==2.7.0.post2
```
To try Sliding Tile Attention (optional), please follow the instructions in [csrc/sliding_tile_attention/README.md](csrc/sliding_tile_attention/README.md) to install STA.

## 🚀 Inference
### Inference StepVideo with Sliding Tile Attention
### Inference StepVideo with Sliding Tile Attention
First, download the model:
```
python scripts/huggingface/download_hf.py --repo_id=stepfun-ai/stepvideo-t2v --local_dir=data/stepvideo-t2v --repo_type=model
python scripts/huggingface/download_hf.py --repo_id=stepfun-ai/stepvideo-t2v --local_dir=data/stepvideo-t2v --repo_type=model
```
Use the following scripts to run inference for StepVideo. When using STA for inference, the generated videos will have dimensions of 204×768×768 (currently, this is the only supported shape).
```bash
@@ -60,7 +67,7 @@ sh scripts/inference/inference_stepvideo.sh # Inference original stepvideo
### Inference HunyuanVideo with Sliding Tile Attention
First, download the model:
```bash
python scripts/huggingface/download_hf.py --repo_id=FastVideo/hunyuan --local_dir=data/hunyuan --repo_type=model
python scripts/huggingface/download_hf.py --repo_id=FastVideo/hunyuan --local_dir=data/hunyuan --repo_type=model
```
We provide two examples in the following script to run inference with STA + [TeaCache](https://github.com/ali-vilab/TeaCache) and STA only.
```bash
@@ -85,7 +92,7 @@ For more information about the VRAM requirements for BitsAndBytes quantization,
| BF16 + Pipeline CPU Offload | 23.883G | 33.744G | 81s | 121.5s |
| INT8 + Pipeline CPU Offload | 13.911G | 27.979G | 88s | 116.7s |
| NF4 + Pipeline CPU Offload | 9.453G | 19.26G | 78s | 114.5s |



For improved quality in generated videos, we recommend using a GPU with 80GB of memory to run the BF16 model with the original Hunyuan pipeline. To execute the inference, use the following section:
@@ -140,28 +147,28 @@ Then you can run the finetune with:
bash scripts/finetune/finetune_mochi.sh # for mochi
```
**Note that for finetuning, we did not tune the hyperparameters in the provided script.**
### ⚡ Lora Finetune
### ⚡ Lora Finetune

Hunyuan supports Lora fine-tuning of videos up to 720p. Demos and prompts for Black-Myth-Wukong can be found [here](https://huggingface.co/FastVideo/Hunyuan-Black-Myth-Wukong-lora-weight). You can download the Lora weights through:
```bash
python scripts/huggingface/download_hf.py --repo_id=FastVideo/Hunyuan-Black-Myth-Wukong-lora-weight --local_dir=data/Hunyuan-Black-Myth-Wukong-lora-weight --repo_type=model
```
#### Minimum Hardware Requirement
- 40 GB GPU memory each for 2 GPUs with lora.
- 30 GB GPU memory each for 2 GPUs with CPU offload and lora.
- 30 GB GPU memory each for 2 GPUs with CPU offload and lora.


Currently, both Mochi and Hunyuan models support Lora finetuning through diffusers. To generate personalized videos from your own dataset, you'll need to follow three main steps: dataset preparation, finetuning, and inference.

#### Dataset Preparation
We provide scripts to better help you get started to train on your own characters!
We provide scripts to better help you get started to train on your own characters!
You can run the following to organize your dataset into videos2caption.json before preprocessing. Specify your video folder and the corresponding caption folder (caption files should be .txt files with the same name as their videos):
```
python scripts/dataset_preparation/prepare_json_file.py --video_dir data/input_videos/ --prompt_dir data/captions/ --output_path data/output_folder/videos2caption.json --verbose
```
Also, we provide a script to resize your videos:
```
python scripts/data_preprocess/resize_videos.py
python scripts/data_preprocess/resize_videos.py
```
#### Finetuning
After basic dataset preparation and preprocessing, you can start to finetune your model using Lora:
@@ -171,12 +178,12 @@ bash scripts/finetune/finetune_hunyuan_hf_lora.sh
#### Inference
For inference with a Lora checkpoint, you can run the following scripts with the additional parameter `--lora_checkpoint_dir`:
```
bash scripts/inference/inference_hunyuan_hf.sh
bash scripts/inference/inference_hunyuan_hf.sh
```
**We also provide scripts for Mochi in the same directory.**

#### Finetune with Both Image and Video
Our codebase supports finetuning with both images and videos.
Our codebase supports finetuning with both images and videos.
```bash
bash scripts/finetune/finetune_hunyuan.sh
bash scripts/finetune/finetune_mochi_lora_mix.sh
@@ -205,26 +212,26 @@ We learned and reused code from the following projects: [PCM](https://github.com

We thank MBZUAI and Anyscale for their support throughout this project.

## Citation
## Citation
If you use FastVideo for your research, please cite our paper:

```bibtex
@misc{zhang2025fastvideogenerationsliding,
title={Fast Video Generation with Sliding Tile Attention},
title={Fast Video Generation with Sliding Tile Attention},
author={Peiyuan Zhang and Yongqi Chen and Runlong Su and Hangliang Ding and Ion Stoica and Zhenghong Liu and Hao Zhang},
year={2025},
eprint={2502.04507},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2502.04507},
url={https://arxiv.org/abs/2502.04507},
}
@misc{ding2025efficientvditefficientvideodiffusion,
title={Efficient-vDiT: Efficient Video Diffusion Transformers With Attention Tile},
title={Efficient-vDiT: Efficient Video Diffusion Transformers With Attention Tile},
author={Hangliang Ding and Dacheng Li and Runlong Su and Peiyuan Zhang and Zhijie Deng and Ion Stoica and Hao Zhang},
year={2025},
eprint={2502.06155},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2502.06155},
url={https://arxiv.org/abs/2502.06155},
}
```
14 changes: 0 additions & 14 deletions env_setup.sh

This file was deleted.

14 changes: 10 additions & 4 deletions fastvideo/v1/pipelines/stages/denoising.py
@@ -138,10 +138,16 @@ def dict_to_3d_list(mask_strategy, t_max=50, l_max=60, h_max=24):
dtype=torch.float16, # TODO(will): hack
distributed=True,
)
from fastvideo.v1.attention.backends.sliding_tile_attn import (
SlidingTileAttentionBackend)
if isinstance(self.attn_backend,
SlidingTileAttentionBackend):

# TODO(will): clean this up...
try:
from fastvideo.v1.attention.backends.sliding_tile_attn import (
SlidingTileAttentionBackend)
except ImportError:
SlidingTileAttentionBackend = None

if SlidingTileAttentionBackend is not None and isinstance(
self.attn_backend, SlidingTileAttentionBackend):
self.attn_metadata_builder_cls = self.attn_backend.get_builder_cls(
)
if self.attn_metadata_builder_cls is not None:
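The denoising-stage hunk above wraps the backend import in a try/except so the pipeline still loads when the CUDA-only st_attn package is absent. A minimal standalone sketch of the same guard pattern (the module and class names below are illustrative stand-ins, not FastVideo's real API):

```python
# Optional-import guard: fall back gracefully when a heavy, platform-specific
# backend cannot be imported (e.g. no CUDA toolchain was available at install).
try:
    from st_attn_stub import SlidingTileBackend  # hypothetical optional module
except ImportError:
    SlidingTileBackend = None


def backend_name(backend_obj):
    """Label a backend, dispatching to the optional one only if it imported."""
    # The isinstance check must be short-circuited: if the import failed,
    # SlidingTileBackend is None and isinstance() would raise TypeError.
    if SlidingTileBackend is not None and isinstance(backend_obj,
                                                     SlidingTileBackend):
        return "sliding_tile"
    return "flash_attn"
```

On a machine without the optional package, `backend_name` simply never takes the sliding-tile branch, which mirrors how the hunk above skips the STA-specific metadata builder.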
4 changes: 1 addition & 3 deletions fastvideo/v1/platforms/cuda.py
@@ -175,12 +175,10 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
target_backend = _Backend.TORCH_SDPA

if target_backend == _Backend.TORCH_SDPA:
# TODO(will): Implement torch SDPA backend.
raise NotImplementedError("Torch SDPA is not implemented yet.")
logger.info(
"Using torch.nn.functional.scaled_dot_product_attention backend."
)
return "fastvideo.v1.attention.backends.torch_sdpa.TorchSDPA"
return "fastvideo.v1.attention.backends.sdpa.SDPABackend"

logger.info("Using Flash Attention backend.")
return "fastvideo.v1.attention.backends.flash_attn.FlashAttentionBackend"
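The platform hook above selects an attention backend by returning its dotted class path as a string, so the heavy module only gets imported once the string is resolved. A generic sketch of how such a path can be turned into a class (this resolver is an assumed illustration of the pattern, not FastVideo's actual loader):

```python
import importlib


def resolve_cls(dotted_path: str):
    """Resolve 'pkg.module.ClassName' into the class object, importing lazily."""
    # Split on the last dot: everything before it is the module path,
    # everything after it is the attribute to fetch from that module.
    module_path, _, cls_name = dotted_path.rpartition(".")
    module = importlib.import_module(module_path)
    return getattr(module, cls_name)
```

Returning strings instead of classes keeps `cuda.py` importable even when a given backend's dependencies are missing, consistent with the optional-import handling elsewhere in this PR.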
56 changes: 43 additions & 13 deletions pyproject.toml
@@ -15,37 +15,67 @@ classifiers = [

dependencies = [
# Core Libraries
"scipy==1.14.1", "six==1.16.0", "h5py==3.12.1",
"scipy==1.14.1", "six==1.16.0", "h5py==3.12.1",

# Machine Learning & Transformers
"transformers==4.46.1", "tokenizers==0.20.1", "sentencepiece==0.2.0",
"transformers>=4.46.1", "tokenizers>=0.20.1", "sentencepiece==0.2.0",
"timm==1.0.11", "peft==0.13.2", "diffusers==0.32.0", "bitsandbytes",
"torch==2.5.1", "torchvision",

# vLLM
"vllm>=0.7.3",

# Acceleration & Optimization
"accelerate==1.0.1", "torch",
"accelerate==1.0.1", "torch",

# Computer Vision & Image Processing
"opencv-python==4.10.0.84", "pillow==10.2.0", "imageio==2.36.0",
"opencv-python==4.10.0.84", "pillow>=10.3.0", "imageio==2.36.0",
"imageio-ffmpeg==0.5.1", "decord==0.6.0", "einops",

# Experiment Tracking & Logging
"wandb==0.18.5", "loguru", "test-tube==0.7.5",

# Miscellaneous Utilities
"tqdm==4.66.5", "PyYAML==6.0.1", "idna==3.6", "protobuf==5.28.3",
"gradio==5.3.0", "huggingface_hub==0.26.1", "moviepy==1.0.3", "flask",
"flask_restful", "aiohttp", "huggingface_hub", "cloudpickle",

# System & Monitoring Tools
"gpustat", "watch",

# Kernel & Packaging
"wheel",
]

[project.optional-dependencies]

# Sliding Tile Attention Kernel
"st_attn>=0.0.1"
# flash-attn: pip install flash-attn==2.7.0.post2 --no-build-isolation

lint = [
# formatting
"yapf==0.32.0",
"toml==0.10.2",
"tomli==2.0.2",
"ruff==0.6.5",
"codespell==2.3.0",
"isort==5.13.2",
"sphinx-lint==1.0.0",

# type checking
"mypy==1.11.1",
"types-PyYAML",
"types-requests",
"types-setuptools",
]

test = [
"av",
"pytorch-msssim",
"pytest",
]

dev = [ "fastvideo[lint]", "fastvideo[test]", ]
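With the extras defined above, the deleted requirements-lint.txt becomes plain pip extras, and `dev` composes the other two via self-referencing extras. Typical usage might look like:

```shell
# Editable install with just the lint tools (what the CI workflows now run)
pip install -e ".[lint]"

# Lint and test tooling together via the self-referencing 'dev' extra
pip install -e ".[dev]"
```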

[project.scripts]
fastvideo = "fastvideo.v1.entrypoints.cli.main:main"

@@ -115,4 +145,4 @@ column_limit = 80
[tool.isort]
line_length = 80
use_parentheses = true
skip_gitignore = true
skip_gitignore = true
14 changes: 0 additions & 14 deletions requirements-lint.txt

This file was deleted.
