From 9e5da2cf6a020fd48e6484bd89338ae10024d54b Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Fri, 12 Sep 2025 15:30:09 -0700
Subject: [PATCH 01/17] Update datasets_utils.py

Make the caption fallback in tokenize_captions robust when images lack a filename attribute
---
 src/data/datasets_utils.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/src/data/datasets_utils.py b/src/data/datasets_utils.py
index afb5a1f..78768b1 100644
--- a/src/data/datasets_utils.py
+++ b/src/data/datasets_utils.py
@@ -27,11 +27,27 @@ def tokenize_captions(
     captions = []
     if "prompt" in examples.keys():
         captions = examples["prompt"]
+    # else:
+    #     for example in examples["image"]:
+    #         path = example.filename
+    #         filename = os.path.splitext(os.path.basename(path))[0]
+    #         caption = filename.replace("_", " ")
+    #         captions.append(caption)
     else:
-        for example in examples["image"]:
-            path = example.filename
-            filename = os.path.splitext(os.path.basename(path))[0]
-            caption = filename.replace("_", " ")
+        for i, img in enumerate(examples["image"]):
+            # try several likely places for a path
+            path = getattr(img, "filename", None)  # PIL Image opened from disk
+            if path is None and isinstance(img, dict):
+                path = img.get("path")             # HF datasets when decode=False
+            if path is None and "image_path" in examples:
+                path = examples["image_path"][i]   # custom parallel column if you have one
+    
+            if path:
+                filename = os.path.splitext(os.path.basename(path))[0]
+                caption = filename.replace("_", " ")
+            else:
+                caption = f"image_{i}"             
+    
             captions.append(caption)
 
     inputs = tokenizer(captions)

From f17bc7238c141554c6e3f1ee83ccbddfbaf2d1b1 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Fri, 12 Sep 2025 16:14:27 -0700
Subject: [PATCH 02/17] Create stable_diffusion.yaml

Add stable diffusion model
---
 config/stable_diffusion.yaml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 config/stable_diffusion.yaml

diff --git a/config/stable_diffusion.yaml b/config/stable_diffusion.yaml
new file mode 100644
index 0000000..48f57dc
--- /dev/null
+++ b/config/stable_diffusion.yaml
@@ -0,0 +1,20 @@
+defaults:
+  - launcher: defaults
+  - accelerate_config: fsdp_config
+  - train_args: stable-diffusion-xl
+  - _self_
+
+accelerate_config:
+  dynamo_config:
+    dynamo_backend: ["no", "inductor"]
+
+train_args:
+  train_batch_size: [1, 10, 17]
+  num_iterations: 20
+  logging_dir: outputs/stable_diffusion_xl
+
+# Override Hydra's run dir to be the same as logging dir. Not setting this may result
+# in errors or unexpected behavior because Hydra by default uses a run dir `./<date><time>`.
+hydra:
+  run:
+    dir: ${train_args.logging_dir}

From 0ca0ce1ac4932112f1e4a6faaf47216ed02e8d28 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Thu, 18 Sep 2025 10:58:10 -0700
Subject: [PATCH 03/17] Update requirements.txt

Updated requirements for PyTorch 25.9 RC
---
 requirements.txt | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d683761..0f968c2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,27 +1,5 @@
-accelerate==1.3.0
-black==25.1.0
-black[jupyter]==25.1.0
-csvkit==2.0.1
-datasets==3.2.0
-decord==0.6.0
-deepspeed==0.16.3
 diffusers==0.33.1
 flatten-dict==0.4.2
-ftfy==6.3.1
-GitPython==3.1.44
-huggingface_hub[cli]==0.30.0
 hydra-core==1.3.2
-imageio-ffmpeg==0.6.0
-isort==6.0.1
-opencv-python-headless==4.10.0.84
-peft==0.15.2
-pre_commit==4.1.0
-protobuf==5.29.2
-pytest==8.3.4
 python-dotenv==1.0.1
-seaborn==0.13.2
-sentencepiece==0.2.0
-tensorboard==2.18.0
-transformers==4.50.0
 xarray[parallel]==2025.3.0
-zarr==2.18.3

From 781e037d933b94ba1fff15edaed267813b955454 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Thu, 18 Sep 2025 10:59:09 -0700
Subject: [PATCH 04/17] Update flux-dev.yaml

Updated dataset information for flux
---
 config/train_args/flux-dev.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/train_args/flux-dev.yaml b/config/train_args/flux-dev.yaml
index 4af7767..af07c97 100644
--- a/config/train_args/flux-dev.yaml
+++ b/config/train_args/flux-dev.yaml
@@ -4,7 +4,7 @@ defaults:
 
 model: flux-dev
 model_path: black-forest-labs/FLUX.1-dev
-train_data_path: bghira/pseudo-camera-10k
+train_data_path: frank-chieng/chinese_architecture_siheyuan
 train_batch_size: 10
 num_iterations: 100
 resolution: 512

From 16a77de9cc4e2d3b4224c716454a1474ea515157 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Thu, 18 Sep 2025 10:59:34 -0700
Subject: [PATCH 05/17] Update stable-diffusion-xl.yaml

Updated dataset information for stable diffusion
---
 config/train_args/stable-diffusion-xl.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/train_args/stable-diffusion-xl.yaml b/config/train_args/stable-diffusion-xl.yaml
index b7540c5..d779976 100644
--- a/config/train_args/stable-diffusion-xl.yaml
+++ b/config/train_args/stable-diffusion-xl.yaml
@@ -4,7 +4,7 @@ defaults:
 
 model: stable-diffusion-xl
 model_path: stabilityai/stable-diffusion-xl-base-1.0
-train_data_path: bghira/pseudo-camera-10k
+train_data_path: frank-chieng/chinese_architecture_siheyuan
 train_batch_size: 20
 num_iterations: 200
 resolution: 1024

From 7e5605cc5983daa06f780f6ffc42fd7c740626ce Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Thu, 18 Sep 2025 11:00:35 -0700
Subject: [PATCH 06/17] Update trainer.py

Update accelerate import statements to make them compatible with accelerate==1.9.0 (25.9 RC)
---
 src/trainer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/trainer.py b/src/trainer.py
index d83bc45..07d73a3 100644
--- a/src/trainer.py
+++ b/src/trainer.py
@@ -15,7 +15,9 @@
 import torch.optim.optimizer
 import transformers
 from accelerate import Accelerator, ProfileKwargs
-from accelerate.utils import DummyOptim, DummyScheduler, DynamoBackend, set_seed
+# from accelerate.utils import DummyOptim, DummyScheduler, DynamoBackend, set_seed
+from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
+from accelerate.utils import DynamoBackend, set_seed
 from diffusers.optimization import get_scheduler
 from diffusers.utils import export_to_video
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (

From efadf5170cb9e5a84eabe719f26a26414202e2e6 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Thu, 18 Sep 2025 11:01:38 -0700
Subject: [PATCH 07/17] Update Makefile

Updated dataset download and model downloads for flux and stable diffusion
---
 Makefile | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/Makefile b/Makefile
index 305e585..24f6fac 100644
--- a/Makefile
+++ b/Makefile
@@ -24,8 +24,8 @@ download_assets:
 		fi \
 	fi
 	# download pseudo-camera-10k dataset
-	@echo "\033[1;31mDownloading bghira/pseudo-camera-10k dataset\033[0m"
-	huggingface-cli download --repo-type=dataset bghira/pseudo-camera-10k
+	@echo "\033[1;31mDownloading frank-chieng/chinese_architecture_siheyuan dataset\033[0m"
+	huggingface-cli download --repo-type=dataset frank-chieng/chinese_architecture_siheyuan
 
 	# download FLUX.1-dev model and checkpoints
 	@echo "\033[1;31mDownloading black-forest-labs/FLUX.1-dev\033[0m"
@@ -35,22 +35,6 @@ download_assets:
 	@echo "\033[1;31mDownloading stabilityai/stable-diffusion-xl-base-1.0\033[0m"
 	huggingface-cli download stabilityai/stable-diffusion-xl-base-1.0
 
-	# download Disney-VideoGeneration-Dataset
-	@echo "\033[1;31mDownloading Wild-Heart/Disney-VideoGeneration-Dataset\033[0m"
-	huggingface-cli download --repo-type=dataset Wild-Heart/Disney-VideoGeneration-Dataset
-
-	# download HunyuanVideo model and checkpoints
-	@echo "\033[1;31mDownloading hunyuanvideo-community/HunyuanVideo\033[0m"
-	huggingface-cli download hunyuanvideo-community/HunyuanVideo
-
-	# download Mochi-1 model and checkpoints
-	@echo "\033[1;31mDownloading genmo/mochi-1-preview\033[0m"
-	huggingface-cli download genmo/mochi-1-preview
-
-	# download Wan2.1 model and checkpoints
-	@echo "\033[1;31mDownloading Wan-AI/Wan2.1-I2V-14B-480P-Diffusers\033[0m"
-	huggingface-cli download Wan-AI/Wan2.1-I2V-14B-480P-Diffusers
-
 	@echo "\033[1;31mDownloading completed.\033[0m"
 
 # Target to build the Docker image

From a3493d40ae500f672a23fe6047aa0c31878d3cef Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Tue, 14 Oct 2025 13:55:25 -0700
Subject: [PATCH 08/17] Update requirements.txt

Update library versions to support hunyuan, mochi and wan models
---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 0f968c2..c0a1f38 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ flatten-dict==0.4.2
 hydra-core==1.3.2
 python-dotenv==1.0.1
 xarray[parallel]==2025.3.0
+decord==0.6.0
+datasets==3.6.0

From 202b4abaa66e639df6ec1bb2436b495d721d8a4f Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Tue, 14 Oct 2025 13:56:33 -0700
Subject: [PATCH 09/17] Update Makefile

Update Makefile to include video models
---
 Makefile | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Makefile b/Makefile
index 24f6fac..0cf32c2 100644
--- a/Makefile
+++ b/Makefile
@@ -35,6 +35,22 @@ download_assets:
 	@echo "\033[1;31mDownloading stabilityai/stable-diffusion-xl-base-1.0\033[0m"
 	huggingface-cli download stabilityai/stable-diffusion-xl-base-1.0
 
+	# download Disney-VideoGeneration-Dataset
+	@echo "\033[1;31mDownloading Wild-Heart/Disney-VideoGeneration-Dataset\033[0m"
+	huggingface-cli download --repo-type=dataset Wild-Heart/Disney-VideoGeneration-Dataset
+
+	# download HunyuanVideo model and checkpoints
+	@echo "\033[1;31mDownloading hunyuanvideo-community/HunyuanVideo\033[0m"
+	huggingface-cli download hunyuanvideo-community/HunyuanVideo
+
+	# download Mochi-1 model and checkpoints
+	@echo "\033[1;31mDownloading genmo/mochi-1-preview\033[0m"
+	huggingface-cli download genmo/mochi-1-preview
+
+	# download Wan2.1 model and checkpoints
+	@echo "\033[1;31mDownloading Wan-AI/Wan2.1-I2V-14B-480P-Diffusers\033[0m"
+	huggingface-cli download Wan-AI/Wan2.1-I2V-14B-480P-Diffusers
+
 	@echo "\033[1;31mDownloading completed.\033[0m"
 
 # Target to build the Docker image

From acd9efca09020d93d10cf8f7d12143daf8e9d4aa Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Tue, 14 Oct 2025 14:00:35 -0700
Subject: [PATCH 10/17] Update trainer.py

Update training-step logging in run_training to support video models.
---
 src/trainer.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/trainer.py b/src/trainer.py
index 07d73a3..36fb09f 100644
--- a/src/trainer.py
+++ b/src/trainer.py
@@ -727,9 +727,24 @@ def run_training(self) -> None:
                     train_loss = 0.0
 
                 postpr_logs = {k: float(f"{v:.4E}") for k, v in logs.items()}
+                
+                # if self.accelerator.is_local_main_process:
+                #     logger.info(f"Step {global_step}: {postpr_logs}")
+                #     self.accelerator.log(logs, step=global_step)
+
+                def _fmt_num(k: str, v):
+                    if not isinstance(v, float):
+                        return v
+                    if k == "lr":
+                        return f"{v:.10f}"   # e.g., 0.0000100000
+                    return f"{v:.6f}"
+
                 if self.accelerator.is_local_main_process:
-                    logger.info(f"Step {global_step}: {postpr_logs}")
-                    self.accelerator.log(logs, step=global_step)
+                    formatted_logs = ", ".join([f"'{k}': {_fmt_num(k, v)}" for k, v in logs.items()])
+                    log_line = f"INFO - Step {global_step} - {{{formatted_logs}}}"
+                    print(log_line, flush=True)
+                    logger.info(log_line)
+                self.accelerator.log(logs, step=global_step)
 
                 # Break if the max number of iterations are exceeded
                 if global_step >= self.args.num_iterations:

From 9fe575fc113ccf5eb71a953e236758e4ca2964f4 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Tue, 14 Oct 2025 14:27:24 -0700
Subject: [PATCH 11/17] Update wan2_1-i2v.yaml

Add working configs for wan2
---
 config/train_args/wan2_1-i2v.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/train_args/wan2_1-i2v.yaml b/config/train_args/wan2_1-i2v.yaml
index e4a06ca..16b81a5 100644
--- a/config/train_args/wan2_1-i2v.yaml
+++ b/config/train_args/wan2_1-i2v.yaml
@@ -5,8 +5,8 @@ defaults:
 model: wan2_1-i2v
 model_path: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers
 train_data_path: Wild-Heart/Disney-VideoGeneration-Dataset
-train_batch_size: 3
-num_iterations: 30
-resolution: "768,512"
-num_frames: 129
+num_frames: 9
 gradient_checkpointing: 1
+train_batch_size: 1
+num_iterations: 100
+resolution: "960,544"

From c6dfdc4095cb9878255585e61caceab36a150ffc Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Tue, 14 Oct 2025 15:56:48 -0700
Subject: [PATCH 12/17] Update requirements.txt

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index c0a1f38..035e57e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,3 @@ hydra-core==1.3.2
 python-dotenv==1.0.1
 xarray[parallel]==2025.3.0
 decord==0.6.0
-datasets==3.6.0

From 46b9b793b98a861970e5b9e42437fc1ab884d5e5 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Tue, 14 Oct 2025 16:20:25 -0700
Subject: [PATCH 13/17] Update requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 035e57e..e2f74e8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@ hydra-core==1.3.2
 python-dotenv==1.0.1
 xarray[parallel]==2025.3.0
 decord==0.6.0
+datasets==3.2.0
+datasets==3.6.0

From 1663badff8c005ce44875db5ca6d5bb83d8004e8 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Tue, 14 Oct 2025 16:41:55 -0700
Subject: [PATCH 14/17] Update requirements.txt

clean up all deps
---
 requirements.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e2f74e8..49cbbc9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,4 @@ hydra-core==1.3.2
 python-dotenv==1.0.1
 xarray[parallel]==2025.3.0
 decord==0.6.0
-datasets==3.2.0
-datasets==3.6.0
+

From a76ef2db8912c32c9dc7991936a6939324d4bc10 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Wed, 15 Oct 2025 16:53:23 -0700
Subject: [PATCH 15/17] Update trainer.py

removed debugging statements
---
 src/trainer.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/trainer.py b/src/trainer.py
index 36fb09f..f20a710 100644
--- a/src/trainer.py
+++ b/src/trainer.py
@@ -728,23 +728,23 @@ def run_training(self) -> None:
 
                 postpr_logs = {k: float(f"{v:.4E}") for k, v in logs.items()}
                 
-                # if self.accelerator.is_local_main_process:
-                #     logger.info(f"Step {global_step}: {postpr_logs}")
-                #     self.accelerator.log(logs, step=global_step)
+                if self.accelerator.is_local_main_process:
+                    logger.info(f"Step {global_step}: {postpr_logs}")
+                    self.accelerator.log(logs, step=global_step)
 
-                def _fmt_num(k: str, v):
-                    if not isinstance(v, float):
-                        return v
-                    if k == "lr":
-                        return f"{v:.10f}"   # e.g., 0.0000100000
-                    return f"{v:.6f}"
+                # def _fmt_num(k: str, v):
+                #     if not isinstance(v, float):
+                #         return v
+                #     if k == "lr":
+                #         return f"{v:.10f}"   # e.g., 0.0000100000
+                #     return f"{v:.6f}"
 
-                if self.accelerator.is_local_main_process:
-                    formatted_logs = ", ".join([f"'{k}': {_fmt_num(k, v)}" for k, v in logs.items()])
-                    log_line = f"INFO - Step {global_step} - {{{formatted_logs}}}"
-                    print(log_line, flush=True)
-                    logger.info(log_line)
-                self.accelerator.log(logs, step=global_step)
+                # if self.accelerator.is_local_main_process:
+                #     formatted_logs = ", ".join([f"'{k}': {_fmt_num(k, v)}" for k, v in logs.items()])
+                #     log_line = f"INFO - Step {global_step} - {{{formatted_logs}}}"
+                #     print(log_line, flush=True)
+                #     logger.info(log_line)
+                # self.accelerator.log(logs, step=global_step)
 
                 # Break if the max number of iterations are exceeded
                 if global_step >= self.args.num_iterations:

From a1b2b86148cdd0bd58b7acb05b7c22e7d1370c53 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Thu, 16 Oct 2025 17:00:20 -0700
Subject: [PATCH 16/17] Update datasets_utils.py

Modified the _preprocess_videos function to support video models
---
 src/data/datasets_utils.py | 107 +++++++++++++++++++++++++++----------
 1 file changed, 79 insertions(+), 28 deletions(-)

diff --git a/src/data/datasets_utils.py b/src/data/datasets_utils.py
index 78768b1..ac50ea2 100644
--- a/src/data/datasets_utils.py
+++ b/src/data/datasets_utils.py
@@ -86,6 +86,60 @@ def _preprocess_images(
     return outputs
 
 
+# def _preprocess_videos(
+#     videos: Iterable,
+#     num_frames: int,
+#     train_transforms: callable = None,
+#     image_processor: callable = None,
+# ) -> List[Dict[str, torch.Tensor]]:
+#     outputs: Dict[str, List[torch.Tensor]] = {"pixel_values": []}
+#     if image_processor is not None:
+#         outputs["processed_image"] = []
+
+#     for video in videos:
+#         current_length = len(video)
+
+#         if num_frames > current_length:
+#             if safely_eval_as_bool(os.getenv("PAD_VIDEOS_TO_NUM_FRAMES", "false")):
+#                 # If num_frames is greater than the current length, pad with last frame
+#                 pad_length = num_frames - current_length
+#                 video = video.get_batch(list(range(current_length))).asnumpy()
+#                 video = np.concatenate(
+#                     [
+#                         video,
+#                         np.tile(video[-1], (pad_length, 1, 1, 1)),
+#                     ],
+#                     axis=0,
+#                 )
+#             else:
+#                 raise ValueError(
+#                     f"num_frames={num_frames} is longer than input video length {current_length}"
+#                 )
+#         else:
+#             video = video.get_batch(list(range(num_frames))).asnumpy()
+
+#         if image_processor is not None:
+#             image = video[0]  # first frame of the video
+#             image = image_processor(images=image, return_tensors="pt")["pixel_values"][
+#                 0
+#             ]
+#             outputs["processed_image"].append(image)
+
+#         if train_transforms:
+#             video = torch.stack(
+#                 [
+#                     train_transforms(torchvision.transforms.ToPILImage()(frame))
+#                     for frame in video
+#                 ]
+#             )
+
+#         outputs["pixel_values"].append(video)
+
+#     return outputs
+
+from decord import VideoReader, cpu
+import torch
+
 def _preprocess_videos(
     videos: Iterable,
     num_frames: int,
@@ -97,47 +151,44 @@ def _preprocess_videos(
         outputs["processed_image"] = []
 
     for video in videos:
-        current_length = len(video)
+        if not hasattr(video, "__len__"):
+            if isinstance(video, str):
+                video = VideoReader(video, ctx=cpu(0))
+            else:
+                frames = [f["data"] for f in video]  # TorchVision returns dicts
+                video = torch.stack(frames).permute(0, 2, 3, 1).numpy()
+
+        if hasattr(video, "__len__"):
+            current_length = len(video)
+        else:
+            current_length = video.shape[0]
 
         if num_frames > current_length:
-            if safely_eval_as_bool(os.getenv("PAD_VIDEOS_TO_NUM_FRAMES", "false")):
-                # If num_frames is greater than the current length, pad with last frame
-                pad_length = num_frames - current_length
-                video = video.get_batch(list(range(current_length))).asnumpy()
-                video = np.concatenate(
-                    [
-                        video,
-                        np.tile(video[-1], (pad_length, 1, 1, 1)),
-                    ],
-                    axis=0,
-                )
-            else:
-                raise ValueError(
-                    f"num_frames={num_frames} is longer than input video length {current_length}"
-                )
+            pad_length = num_frames - current_length
+            video = video.get_batch(list(range(current_length))).asnumpy() if isinstance(video, VideoReader) else video
+            video = np.concatenate([video, np.tile(video[-1], (pad_length, 1, 1, 1))], axis=0)
         else:
-            video = video.get_batch(list(range(num_frames))).asnumpy()
+            if isinstance(video, VideoReader):
+                video = video.get_batch(list(range(num_frames))).asnumpy()
+            else:
+                video = video[:num_frames]
 
         if image_processor is not None:
-            image = video[0]  # first frame of the video
-            image = image_processor(images=image, return_tensors="pt")["pixel_values"][
-                0
-            ]
+            image = video[0]
+            image = image_processor(images=image, return_tensors="pt")["pixel_values"][0]
             outputs["processed_image"].append(image)
-
         if train_transforms:
-            video = torch.stack(
-                [
-                    train_transforms(torchvision.transforms.ToPILImage()(frame))
-                    for frame in video
-                ]
-            )
+            video = torch.stack([
+                train_transforms(torchvision.transforms.ToPILImage()(frame))
+                for frame in video
+            ])
 
         outputs["pixel_values"].append(video)
 
     return outputs
 
 
+
 def preprocess_train(
     examples: dict[str, torch.Tensor],
     args: argparse.Namespace,

From a8ec74836c19df10a75bb595e679291b0bfd1e63 Mon Sep 17 00:00:00 2001
From: Kailash Gogineni <gkailashnath1998@gmail.com>
Date: Thu, 6 Nov 2025 15:15:27 -0800
Subject: [PATCH 17/17] Update trainer.py to make a patch for diffusion models
 for 25.10 RC

---
 src/trainer.py | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/trainer.py b/src/trainer.py
index f20a710..36fb09f 100644
--- a/src/trainer.py
+++ b/src/trainer.py
@@ -728,23 +728,23 @@ def run_training(self) -> None:
 
                 postpr_logs = {k: float(f"{v:.4E}") for k, v in logs.items()}
                 
-                if self.accelerator.is_local_main_process:
-                    logger.info(f"Step {global_step}: {postpr_logs}")
-                    self.accelerator.log(logs, step=global_step)
+                # if self.accelerator.is_local_main_process:
+                #     logger.info(f"Step {global_step}: {postpr_logs}")
+                #     self.accelerator.log(logs, step=global_step)
 
-                # def _fmt_num(k: str, v):
-                #     if not isinstance(v, float):
-                #         return v
-                #     if k == "lr":
-                #         return f"{v:.10f}"   # e.g., 0.0000100000
-                #     return f"{v:.6f}"
+                def _fmt_num(k: str, v):
+                    if not isinstance(v, float):
+                        return v
+                    if k == "lr":
+                        return f"{v:.10f}"   # e.g., 0.0000100000
+                    return f"{v:.6f}"
 
-                # if self.accelerator.is_local_main_process:
-                #     formatted_logs = ", ".join([f"'{k}': {_fmt_num(k, v)}" for k, v in logs.items()])
-                #     log_line = f"INFO - Step {global_step} - {{{formatted_logs}}}"
-                #     print(log_line, flush=True)
-                #     logger.info(log_line)
-                # self.accelerator.log(logs, step=global_step)
+                if self.accelerator.is_local_main_process:
+                    formatted_logs = ", ".join([f"'{k}': {_fmt_num(k, v)}" for k, v in logs.items()])
+                    log_line = f"INFO - Step {global_step} - {{{formatted_logs}}}"
+                    print(log_line, flush=True)
+                    logger.info(log_line)
+                self.accelerator.log(logs, step=global_step)
 
                 # Break if the max number of iterations are exceeded
                 if global_step >= self.args.num_iterations: