@@ -126,31 +126,29 @@ class Glm4vVideoPixelInputs(TensorSchema):
         - np: Number of patches
         - ctpp: Number of channels * temporal_patch_size *
           patch_size * patch_size
-        - nv: Number of videos
         - f: Number of frames
         - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
 
     pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctpp")]
-    # video_metadata: Union[list[VideoMetadata], list[dict]]
-    video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", "f", 3)]
+    video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
 
 
 class Glm4vVideoEmbeddingInputs(TensorSchema):
     """
     Dimensions:
         - p: Number of video patches across all frames
         - h: Hidden size (must match language model backbone)
-        - n: Number of videos
+        - f: Number of frames
         - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
     type: Literal["video_embeds"] = "video_embeds"
 
     video_embeds: Annotated[torch.Tensor, TensorShape("p", "h")]
-    video_grid_thw: Annotated[torch.Tensor, TensorShape("n", 1, 3)]
+    video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
 
 
 Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs]
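
This hunk flattens `video_grid_thw` from a per-video nested layout to one row per frame. A minimal sketch of what the new shape implies, with made-up grid sizes (the tensors and values below are illustrative assumptions, not from this PR):

```python
import torch

# Two processed videos whose frames are concatenated along dim 0: each
# row of video_grid_thw is one (grid_t, grid_h, grid_w) triple, giving
# shape ("f", 3) instead of the old nested ("nv", "f", 3).
video_grid_thw = torch.tensor([
    [1, 24, 32],  # frame grid for video 0 (grid_t is usually 1)
    [1, 24, 32],
    [1, 16, 16],  # frame grid for video 1
])

# The patch count in pixel_values_videos ("np") must equal the total
# number of positions described by the grids.
num_patches = int(video_grid_thw.prod(dim=-1).sum())  # 2*(1*24*32) + 1*16*16
```
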
@@ -1348,7 +1346,6 @@ def _parse_and_validate_video_input(
 
         return Glm4vVideoPixelInputs(
             type="pixel_values_videos",
-            # video_metadata=video_metadata,
             pixel_values_videos=pixel_values_videos,
             video_grid_thw=video_grid_thw,
         )
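
Callers that still build the old nested grid tensor only need a reshape before constructing the inputs. A hypothetical adapter, not part of this PR, assuming the nested tensor is well-formed:

```python
import torch

# Hypothetical helper (not in this PR): collapse the old nested
# ("nv", "f", 3) grid tensor into the flat ("f", 3) layout that
# Glm4vVideoPixelInputs and Glm4vVideoEmbeddingInputs now expect.
def flatten_video_grid_thw(nested: torch.Tensor) -> torch.Tensor:
    assert nested.ndim == 3 and nested.shape[-1] == 3
    return nested.reshape(-1, 3)
```
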