@@ -288,7 +288,7 @@ def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
288288 return wav .float () / (2 ** 31 )
289289 raise ValueError (f"Unsupported wav dtype: { wav .dtype } " )
290290
291- def load (filepath : str , frame_offset : int = 0 , num_frames : int = - 1 ) -> tuple [torch .Tensor , int ]:
291+ def load (filepath : str ) -> tuple [torch .Tensor , int ]:
292292 with av .open (filepath ) as af :
293293 if not af .streams .audio :
294294 raise ValueError ("No audio stream found in the file." )
@@ -297,40 +297,20 @@ def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[to
297297 sr = stream .codec_context .sample_rate
298298 n_channels = stream .channels
299299
300- seek_time = frame_offset / sr if frame_offset > 0 else 0.0
301- duration = num_frames / sr if num_frames > 0 else - 1.0
302-
303- sample_offset = int (sr * seek_time )
304- num_samples = int (sr * duration ) if duration >= 0 else - 1
305-
306- # Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary'
307- seek_sec = max (0 , seek_time - 0.1 ) if filepath .lower ().endswith ('.mp3' ) else seek_time
308- af .seek (int (seek_sec / stream .time_base ), stream = stream )
309-
310300 frames = []
311301 length = 0
312302 for frame in af .decode (streams = stream .index ):
313- current_offset = int (frame .rate * frame .pts * frame .time_base )
314- strip = max (0 , sample_offset - current_offset )
315-
316303 buf = torch .from_numpy (frame .to_ndarray ())
317304 if buf .shape [0 ] != n_channels :
318305 buf = buf .view (- 1 , n_channels ).t ()
319306
320- buf = buf [:, strip :]
321307 frames .append (buf )
322308 length += buf .shape [1 ]
323309
324- if num_samples > 0 and length >= num_samples :
325- break
326-
327310 if not frames :
328311 raise ValueError ("No audio frames decoded." )
329312
330313 wav = torch .cat (frames , dim = 1 )
331- if num_samples > 0 :
332- wav = wav [:, :num_samples ]
333-
334314 wav = f32_pcm (wav )
335315 return wav , sr
336316
0 commit comments