@@ -125,17 +125,7 @@ async def async_fetch_image(image_url: str,
125
125
return image .convert (image_mode )
126
126
127
127
128
- def _load_video_frames_from_bytes (b : bytes ):
129
- frame = Image .open (BytesIO (b ))
130
- return np .array (frame )
131
-
132
-
133
- def load_video_frames_from_base64 (frame : Union [bytes , str ]):
134
- """Load frame from base64 format."""
135
- return _load_video_frames_from_bytes (base64 .b64decode (frame ))
136
-
137
-
138
- def _load_video_from_bytes (b : bytes , num_frames : int = 32 ):
128
+ def _load_video_from_bytes (b : bytes , num_frames : int = 32 ) -> npt .NDArray :
139
129
_ , decord = try_import_video_packages ()
140
130
141
131
video_path = BytesIO (b )
@@ -155,13 +145,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
155
145
return frames
156
146
157
147
158
- def _load_video_from_data_url (video_url : str ):
159
- # Only split once and assume the second part is the base64 encoded image
160
- frames_base64 = video_url .split ("," )[1 :]
161
- return np .stack ([
162
- load_video_frames_from_base64 (frame_base64 )
163
- for frame_base64 in frames_base64
164
- ])
148
+ def _load_video_from_data_url (video_url : str ) -> npt .NDArray :
149
+ # Only split once and assume the second part is the base64 encoded video
150
+ _ , video_base64 = video_url .split ("," , 1 )
151
+
152
+ if video_url .startswith ("data:video/jpeg;" ):
153
+ return np .stack ([
154
+ np .array (load_image_from_base64 (frame_base64 ))
155
+ for frame_base64 in video_base64 .split ("," )
156
+ ])
157
+
158
+ return load_video_from_base64 (video_base64 )
165
159
166
160
167
161
def fetch_video (video_url : str , * , num_frames : int = 32 ) -> npt .NDArray :
@@ -342,7 +336,7 @@ def rescale_image_size(image: Image.Image,
342
336
return image
343
337
344
338
345
- def try_import_video_packages () -> Any :
339
+ def try_import_video_packages ():
346
340
try :
347
341
import cv2
348
342
import decord
@@ -384,7 +378,7 @@ def sample_frames_from_video(frames: npt.NDArray,
384
378
return sampled_frames
385
379
386
380
387
- def encode_video_base64 (frames : npt .NDArray ):
381
+ def encode_video_base64 (frames : npt .NDArray ) -> str :
388
382
base64_frames = []
389
383
frames_list = [frames [i ] for i in range (frames .shape [0 ])]
390
384
for frame in frames_list :
@@ -393,6 +387,11 @@ def encode_video_base64(frames: npt.NDArray):
393
387
return "," .join (base64_frames )
394
388
395
389
390
+ def load_video_from_base64 (video : Union [bytes , str ]) -> npt .NDArray :
391
+ """Load video from base64 format."""
392
+ return _load_video_from_bytes (base64 .b64decode (video ))
393
+
394
+
396
395
def resolve_visual_encoder_outputs (
397
396
encoder_outputs : Union [torch .Tensor , list [torch .Tensor ]],
398
397
feature_sample_layers : Optional [list [int ]],
0 commit comments