hao-ai-lab · SolitaryThinker · Apr 7, 2025 · Apr 6, 2025 · Apr 6, 2025 · Apr 6, 2025
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
@@ -117,7 +117,6 @@ jobs:
           --volume-size 100
           --test-command "pip install -e .[test] &&
           pip install flash-attn==2.7.0.post2 --no-build-isolation &&
-          python scripts/huggingface/download_hf.py --repo_id=FastVideo/FastHunyuan-diffusers --local_dir=data/FastHunyuan-diffusers --repo_type=model &&
           pytest ./fastvideo/v1/tests/ssim -vs"
 
   runpod-cleanup:

diff --git a/.gitignore b/.gitignore
@@ -10,7 +10,7 @@ wandb/
 *.jpg
 *.safetensors
 *.mp4
-!fastvideo/v1/tests/ssim/reference_videos/*.mp4
+!fastvideo/v1/tests/ssim/reference_videos/**/*.mp4
 *.png
 *.gif
 *.pth

diff --git a/fastvideo/v1/default_configs/v1_inference_hunyuan_config.yaml b/fastvideo/v1/default_configs/v1_inference_hunyuan_config.yaml
@@ -1,9 +1,6 @@
 num_gpus: 4
 model_path: FastVideo/FastHunyuan-diffusers
 master_port: 29503
-use-v1-transformer: True
-use-v1-vae: True
-use-v1-text-encoder: True
 sp_size: 4 
 tp_size: 4 
 height: 720 

diff --git a/fastvideo/v1/inference_args.py b/fastvideo/v1/inference_args.py
@@ -85,21 +85,6 @@ def __post_init__(self):
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
-        parser.add_argument(
-            "--use-v1-text-encoder",
-            action="store_true",
-            help="Use the v1 text encoder",
-        )
-        parser.add_argument(
-            "--use-v1-vae",
-            action="store_true",
-            help="Use the v1 vae",
-        )
-        parser.add_argument(
-            "--use-v1-transformer",
-            action="store_true",
-            help="Use the v1 transformer",
-        )
         # Model and path configuration
         parser.add_argument(
             "--model-path",

diff --git a/fastvideo/v1/tests/ssim/README.md b/fastvideo/v1/tests/ssim/README.md
@@ -1,6 +1,7 @@
 The reference videos in the `reference_videos` directory are used as part of an e2e test to ensure consistency in video generation quality across code changes. `test_inference_similarity.py` compares newly generated videos against these references using Structural Similarity Index (SSIM) metrics to detect any regressions in visual quality across code changes.
 
-The reference videos were generated on commit `66107fd5b8469fed25972feb632cd48887dac451` of the FastVideo codebase.
+- `reference_videos/FLASH_ATTN/` videos were generated on commit `66107fd5b8469fed25972feb632cd48887dac451` of the FastVideo codebase.
+- `reference_videos/TORCH_SDPA/` videos were generated on commit `4ea008b8a16d7f5678a44b187ebdd7d9d0416ff1` of the FastVideo codebase.
 
 ## Generation Details
 
@@ -22,9 +23,6 @@ The reference videos were generated on commit `66107fd5b8469fed25972feb632cd4888
 "sp_size": 2,
 "tp_size": 2,
 "vae_sp": true,
-"use_v1_transformer": true,
-"use_v1_vae": true,
-"use_v1_text_encoder": true,
 "fps": 24
 }
 

diff --git a/...the vast horizon. The rugged landscap.mp4 → ...the vast horizon. The rugged landscap.mp4 b/...the vast horizon. The rugged landscap.mp4 → ...the vast horizon. The rugged landscap.mp4
diff --git a/...ing with the energetic background of .mp4 → ...ing with the energetic background of .mp4 b/...ing with the energetic background of .mp4 → ...ing with the energetic background of .mp4
diff --git a/...ands atop a towering cliff, silhouetted against the vast horizon. The rugged landscap.mp4 b/...ands atop a towering cliff, silhouetted against the vast horizon. The rugged landscap.mp4
diff --git a/...ally eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4 b/...ally eats noodles, his relaxed demeanor contrasting with the energetic background of .mp4
diff --git a/fastvideo/v1/tests/ssim/test_inference_similarity.py b/fastvideo/v1/tests/ssim/test_inference_similarity.py
@@ -10,7 +10,7 @@
 # Base parameters from the shell script
 BASE_PARAMS = {
     "num_gpus": 2,
-    "model_path": "data/FastHunyuan-diffusers",
+    "model_path": "FastVideo/FastHunyuan-diffusers",
     "height": 720,
     "width": 1280,
     "num_frames": 45,
@@ -22,9 +22,6 @@
     "sp_size": 2,
     "tp_size": 2,
     "vae_sp": True,
-    "use_v1_transformer": True,
-    "use_v1_vae": True,
-    "use_v1_text_encoder": True,
     "fps": 24,
 }
 
@@ -75,16 +72,18 @@ def write_ssim_results(output_dir, ssim_values, reference_path, generated_path,
 
 @pytest.mark.parametrize("num_inference_steps", [6])
 @pytest.mark.parametrize("prompt", TEST_PROMPTS)
-def test_inference_similarity(num_inference_steps, prompt):
+@pytest.mark.parametrize("ATTENTION_BACKEND", ["FLASH_ATTN", "TORCH_SDPA"])
+def test_inference_similarity(num_inference_steps, prompt, ATTENTION_BACKEND):
     """
     Test that runs inference with different parameters and compares the output
     to reference videos using SSIM.
     """
+    os.environ["FASTVIDEO_ATTENTION_BACKEND"] = ATTENTION_BACKEND
+
     script_dir = os.path.dirname(os.path.abspath(__file__))
 
     base_output_dir = os.path.join(script_dir, 'generated_videos')
-    output_dir = os.path.join(base_output_dir,
-                              f'num_inference_steps={num_inference_steps}')
+    output_dir = os.path.join(base_output_dir, ATTENTION_BACKEND)
     output_video_name = f"{prompt[:100]}.mp4"
 
     os.makedirs(output_dir, exist_ok=True)
@@ -120,12 +119,6 @@ def test_inference_similarity(num_inference_steps, prompt):
         str(BASE_PARAMS["fps"]),
     ]
 
-    if BASE_PARAMS["use_v1_transformer"]:
-        launch_args.append("--use-v1-transformer")
-    if BASE_PARAMS["use_v1_vae"]:
-        launch_args.append("--use-v1-vae")
-    if BASE_PARAMS["use_v1_text_encoder"]:
-        launch_args.append("--use-v1-text-encoder")
     if BASE_PARAMS["vae_sp"]:
         launch_args.append("--vae-sp")
 
@@ -134,7 +127,7 @@ def test_inference_similarity(num_inference_steps, prompt):
     assert os.path.exists(
         output_dir), f"Output video was not generated at {output_dir}"
 
-    reference_folder = os.path.join(script_dir, 'reference_videos')
+    reference_folder = os.path.join(script_dir, 'reference_videos', ATTENTION_BACKEND)
 
     if not os.path.exists(reference_folder):
         logger.error("Reference folder missing")
@@ -149,7 +142,7 @@ def test_inference_similarity(num_inference_steps, prompt):
             break
 
     if not reference_video_name:
-        logger.error(f"Reference video not found for prompt: {prompt}")
+        logger.error(f"Reference video not found for prompt: {prompt} with backend: {ATTENTION_BACKEND}")
         raise FileNotFoundError(f"Reference video missing")
 
     reference_video_path = os.path.join(reference_folder, reference_video_name)

diff --git a/scripts/inference/v1_inference_hunyuan.sh b/scripts/inference/v1_inference_hunyuan.sh
@@ -9,9 +9,6 @@ export MODEL_BASE=FastVideo/FastHunyuan-diffusers
 # dit model and tp_size is used for encoder models.
 torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
     fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --use-v1-transformer \
-    --use-v1-vae \
-    --use-v1-text-encoder \
     --sp_size 4 \
     --tp_size 4 \
     --height 720 \

diff --git a/scripts/inference/v1_inference_hunyuan_STA.sh b/scripts/inference/v1_inference_hunyuan_STA.sh
@@ -10,9 +10,6 @@ export MODEL_BASE=FastVideo/FastHunyuan-diffusers
 # dit model and tp_size is used for encoder models.
 torchrun --nnodes=1 --nproc_per_node=$num_gpus --master_port 29503 \
     fastvideo/v1/sample/v1_fastvideo_inference.py \
-    --use-v1-transformer \
-    --use-v1-vae \
-    --use-v1-text-encoder \
     --sp_size 1 \
     --tp_size 1 \
     --height 768 \
diff --git a/.gitignore b/.gitignore
@@ -10,7 +10,7 @@ wandb/
 *.jpg
 *.safetensors
 *.mp4
-!fastvideo/v1/tests/ssim/reference_videos/*.mp4
+!fastvideo/v1/tests/ssim/reference_videos/**/*.mp4
 *.png
 *.gif
 *.pth