2 files changed: +118 -101 lines changed
Original file line number | Diff line number | Diff line change
1
+ # Convert Hugging Face fine-tuned models to ggml format
2
+ #
3
+ # Usage:
4
+ #
5
+ # git clone https://github.com/openai/whisper
6
+ # git clone https://github.com/ggerganov/whisper.cpp
7
+ # git clone https://huggingface.co/openai/whisper-medium
8
+ #
9
+ # python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
10
+ #
11
+ # This script is similar to "convert-pt-to-ggml.py"
12
+ #
13
+ # For more info:
14
+ #
15
+ # https://github.com/ggerganov/whisper.cpp/issues/157
16
+ #
17
+
1
18
import io
2
19
import os
3
20
import sys
Original file line number | Diff line number | Diff line change
44
44
#from transformers import GPT2TokenizerFast
45
45
46
46
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
47
- LANGUAGES = {
48
- "en" : "english" ,
49
- "zh" : "chinese" ,
50
- "de" : "german" ,
51
- "es" : "spanish" ,
52
- "ru" : "russian" ,
53
- "ko" : "korean" ,
54
- "fr" : "french" ,
55
- "ja" : "japanese" ,
56
- "pt" : "portuguese" ,
57
- "tr" : "turkish" ,
58
- "pl" : "polish" ,
59
- "ca" : "catalan" ,
60
- "nl" : "dutch" ,
61
- "ar" : "arabic" ,
62
- "sv" : "swedish" ,
63
- "it" : "italian" ,
64
- "id" : "indonesian" ,
65
- "hi" : "hindi" ,
66
- "fi" : "finnish" ,
67
- "vi" : "vietnamese" ,
68
- "iw" : "hebrew" ,
69
- "uk" : "ukrainian" ,
70
- "el" : "greek" ,
71
- "ms" : "malay" ,
72
- "cs" : "czech" ,
73
- "ro" : "romanian" ,
74
- "da" : "danish" ,
75
- "hu" : "hungarian" ,
76
- "ta" : "tamil" ,
77
- "no" : "norwegian" ,
78
- "th" : "thai" ,
79
- "ur" : "urdu" ,
80
- "hr" : "croatian" ,
81
- "bg" : "bulgarian" ,
82
- "lt" : "lithuanian" ,
83
- "la" : "latin" ,
84
- "mi" : "maori" ,
85
- "ml" : "malayalam" ,
86
- "cy" : "welsh" ,
87
- "sk" : "slovak" ,
88
- "te" : "telugu" ,
89
- "fa" : "persian" ,
90
- "lv" : "latvian" ,
91
- "bn" : "bengali" ,
92
- "sr" : "serbian" ,
93
- "az" : "azerbaijani" ,
94
- "sl" : "slovenian" ,
95
- "kn" : "kannada" ,
96
- "et" : "estonian" ,
97
- "mk" : "macedonian" ,
98
- "br" : "breton" ,
99
- "eu" : "basque" ,
100
- "is" : "icelandic" ,
101
- "hy" : "armenian" ,
102
- "ne" : "nepali" ,
103
- "mn" : "mongolian" ,
104
- "bs" : "bosnian" ,
105
- "kk" : "kazakh" ,
106
- "sq" : "albanian" ,
107
- "sw" : "swahili" ,
108
- "gl" : "galician" ,
109
- "mr" : "marathi" ,
110
- "pa" : "punjabi" ,
111
- "si" : "sinhala" ,
112
- "km" : "khmer" ,
113
- "sn" : "shona" ,
114
- "yo" : "yoruba" ,
115
- "so" : "somali" ,
116
- "af" : "afrikaans" ,
117
- "oc" : "occitan" ,
118
- "ka" : "georgian" ,
119
- "be" : "belarusian" ,
120
- "tg" : "tajik" ,
121
- "sd" : "sindhi" ,
122
- "gu" : "gujarati" ,
123
- "am" : "amharic" ,
124
- "yi" : "yiddish" ,
125
- "lo" : "lao" ,
126
- "uz" : "uzbek" ,
127
- "fo" : "faroese" ,
128
- "ht" : "haitian creole" ,
129
- "ps" : "pashto" ,
130
- "tk" : "turkmen" ,
131
- "nn" : "nynorsk" ,
132
- "mt" : "maltese" ,
133
- "sa" : "sanskrit" ,
134
- "lb" : "luxembourgish" ,
135
- "my" : "myanmar" ,
136
- "bo" : "tibetan" ,
137
- "tl" : "tagalog" ,
138
- "mg" : "malagasy" ,
139
- "as" : "assamese" ,
140
- "tt" : "tatar" ,
141
- "haw" : "hawaiian" ,
142
- "ln" : "lingala" ,
143
- "ha" : "hausa" ,
144
- "ba" : "bashkir" ,
145
- "jw" : "javanese" ,
146
- "su" : "sundanese" ,
147
- }
47
+ # LANGUAGES = {
48
+ # "en": "english",
49
+ # "zh": "chinese",
50
+ # "de": "german",
51
+ # "es": "spanish",
52
+ # "ru": "russian",
53
+ # "ko": "korean",
54
+ # "fr": "french",
55
+ # "ja": "japanese",
56
+ # "pt": "portuguese",
57
+ # "tr": "turkish",
58
+ # "pl": "polish",
59
+ # "ca": "catalan",
60
+ # "nl": "dutch",
61
+ # "ar": "arabic",
62
+ # "sv": "swedish",
63
+ # "it": "italian",
64
+ # "id": "indonesian",
65
+ # "hi": "hindi",
66
+ # "fi": "finnish",
67
+ # "vi": "vietnamese",
68
+ # "iw": "hebrew",
69
+ # "uk": "ukrainian",
70
+ # "el": "greek",
71
+ # "ms": "malay",
72
+ # "cs": "czech",
73
+ # "ro": "romanian",
74
+ # "da": "danish",
75
+ # "hu": "hungarian",
76
+ # "ta": "tamil",
77
+ # "no": "norwegian",
78
+ # "th": "thai",
79
+ # "ur": "urdu",
80
+ # "hr": "croatian",
81
+ # "bg": "bulgarian",
82
+ # "lt": "lithuanian",
83
+ # "la": "latin",
84
+ # "mi": "maori",
85
+ # "ml": "malayalam",
86
+ # "cy": "welsh",
87
+ # "sk": "slovak",
88
+ # "te": "telugu",
89
+ # "fa": "persian",
90
+ # "lv": "latvian",
91
+ # "bn": "bengali",
92
+ # "sr": "serbian",
93
+ # "az": "azerbaijani",
94
+ # "sl": "slovenian",
95
+ # "kn": "kannada",
96
+ # "et": "estonian",
97
+ # "mk": "macedonian",
98
+ # "br": "breton",
99
+ # "eu": "basque",
100
+ # "is": "icelandic",
101
+ # "hy": "armenian",
102
+ # "ne": "nepali",
103
+ # "mn": "mongolian",
104
+ # "bs": "bosnian",
105
+ # "kk": "kazakh",
106
+ # "sq": "albanian",
107
+ # "sw": "swahili",
108
+ # "gl": "galician",
109
+ # "mr": "marathi",
110
+ # "pa": "punjabi",
111
+ # "si": "sinhala",
112
+ # "km": "khmer",
113
+ # "sn": "shona",
114
+ # "yo": "yoruba",
115
+ # "so": "somali",
116
+ # "af": "afrikaans",
117
+ # "oc": "occitan",
118
+ # "ka": "georgian",
119
+ # "be": "belarusian",
120
+ # "tg": "tajik",
121
+ # "sd": "sindhi",
122
+ # "gu": "gujarati",
123
+ # "am": "amharic",
124
+ # "yi": "yiddish",
125
+ # "lo": "lao",
126
+ # "uz": "uzbek",
127
+ # "fo": "faroese",
128
+ # "ht": "haitian creole",
129
+ # "ps": "pashto",
130
+ # "tk": "turkmen",
131
+ # "nn": "nynorsk",
132
+ # "mt": "maltese",
133
+ # "sa": "sanskrit",
134
+ # "lb": "luxembourgish",
135
+ # "my": "myanmar",
136
+ # "bo": "tibetan",
137
+ # "tl": "tagalog",
138
+ # "mg": "malagasy",
139
+ # "as": "assamese",
140
+ # "tt": "tatar",
141
+ # "haw": "hawaiian",
142
+ # "ln": "lingala",
143
+ # "ha": "hausa",
144
+ # "ba": "bashkir",
145
+ # "jw": "javanese",
146
+ # "su": "sundanese",
147
+ # }
148
148
149
149
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
150
150
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
You can’t perform that action at this time.
0 commit comments