forked from MahmoudAshraf97/whisper-diarization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcription_helpers.py
74 lines (63 loc) · 2.06 KB
/
transcription_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torch
def transcribe(
    audio_file: str,
    language: str,
    model_name: str,
    compute_dtype: str,
    suppress_numerals: bool,
    device: str,
):
    """Transcribe ``audio_file`` with a non-batched faster-whisper model.

    Args:
        audio_file: Audio input forwarded unchanged to ``WhisperModel.transcribe``.
        language: Language code forwarded to the model. ``None`` is tolerated
            by this function (it is checked explicitly below).
        model_name: Model identifier handed to ``WhisperModel``.
        compute_dtype: Passed to ``WhisperModel`` as ``compute_type``.
        suppress_numerals: When true, the token ids returned by
            ``find_numeral_symbol_tokens`` are passed as ``suppress_tokens``;
            otherwise ``suppress_tokens`` is ``None``.
        device: Device string handed to ``WhisperModel``.

    Returns:
        A ``(segments, language)`` tuple: a list with one ``_asdict()`` dict
        per decoded segment, and ``info.language`` as reported by the model.
    """
    from faster_whisper import WhisperModel
    from helpers import find_numeral_symbol_tokens, wav2vec2_langs

    model = WhisperModel(model_name, device=device, compute_type=compute_dtype)

    tokens_to_suppress = (
        find_numeral_symbol_tokens(model.hf_tokenizer) if suppress_numerals else None
    )

    # Word-level timestamps are requested from Whisper only when the language
    # is unknown or is not listed in wav2vec2_langs.
    need_word_timestamps = language is None or language not in wav2vec2_langs

    segment_iter, info = model.transcribe(
        audio_file,
        language=language,
        beam_size=5,
        word_timestamps=need_word_timestamps,
        suppress_tokens=tokens_to_suppress,
        vad_filter=True,
    )

    # Consume the segment iterable into plain dicts before dropping the model.
    whisper_results = [segment._asdict() for segment in segment_iter]

    # clear gpu vram
    del model
    torch.cuda.empty_cache()

    return whisper_results, info.language
def transcribe_batched(
    audio_file: str,
    language: str,
    batch_size: int,
    model_name: str,
    compute_dtype: str,
    suppress_numerals: bool,
    device: str,
):
    """Transcribe ``audio_file`` with the batched whisperx pipeline.

    Args:
        audio_file: Path handed to ``whisperx.load_audio``.
        language: Language code forwarded to the model's ``transcribe`` call.
        batch_size: Batch size forwarded to the model's ``transcribe`` call.
        model_name: Model identifier handed to ``whisperx.load_model``.
        compute_dtype: Passed to ``whisperx.load_model`` as ``compute_type``.
        suppress_numerals: Forwarded through ``asr_options`` under the
            ``"suppress_numerals"`` key.
        device: Device argument handed to ``whisperx.load_model``.

    Returns:
        A ``(segments, language, audio)`` tuple: the ``"segments"`` and
        ``"language"`` entries of the transcription result, plus the audio
        object returned by ``whisperx.load_audio``.
    """
    import whisperx

    decoding_options = {"suppress_numerals": suppress_numerals}
    model = whisperx.load_model(
        model_name,
        device,
        compute_type=compute_dtype,
        asr_options=decoding_options,
    )

    audio = whisperx.load_audio(audio_file)
    output = model.transcribe(audio, language=language, batch_size=batch_size)

    # Drop the model and release cached CUDA memory before returning.
    del model
    torch.cuda.empty_cache()

    segments = output["segments"]
    detected_language = output["language"]
    return segments, detected_language, audio