Skip to content

Commit 00f46db

Browse files
committed
models : add usage comments to the HF convert script (ggml-org#157)
1 parent 5698bdd commit 00f46db

File tree

2 files changed

+118
-101
lines changed

2 files changed

+118
-101
lines changed

models/convert-h5-to-ggml.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,20 @@
1+
# Convert Hugging Face fine-tuned models to ggml format
2+
#
3+
# Usage:
4+
#
5+
# git clone https://github.com/openai/whisper
6+
# git clone https://github.com/ggerganov/whisper.cpp
7+
# git clone https://huggingface.co/openai/whisper-medium
8+
#
9+
# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
10+
#
11+
# This script is similar to "convert-pt-to-ggml.py"
12+
#
13+
# For more info:
14+
#
15+
# https://github.com/ggerganov/whisper.cpp/issues/157
16+
#
17+
1 18
import io
2 19
import os
3 20
import sys

models/convert-pt-to-ggml.py

Lines changed: 101 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -44,107 +44,107 @@
44 44
#from transformers import GPT2TokenizerFast
45 45

46 46
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
47-
LANGUAGES = {
48-
"en": "english",
49-
"zh": "chinese",
50-
"de": "german",
51-
"es": "spanish",
52-
"ru": "russian",
53-
"ko": "korean",
54-
"fr": "french",
55-
"ja": "japanese",
56-
"pt": "portuguese",
57-
"tr": "turkish",
58-
"pl": "polish",
59-
"ca": "catalan",
60-
"nl": "dutch",
61-
"ar": "arabic",
62-
"sv": "swedish",
63-
"it": "italian",
64-
"id": "indonesian",
65-
"hi": "hindi",
66-
"fi": "finnish",
67-
"vi": "vietnamese",
68-
"iw": "hebrew",
69-
"uk": "ukrainian",
70-
"el": "greek",
71-
"ms": "malay",
72-
"cs": "czech",
73-
"ro": "romanian",
74-
"da": "danish",
75-
"hu": "hungarian",
76-
"ta": "tamil",
77-
"no": "norwegian",
78-
"th": "thai",
79-
"ur": "urdu",
80-
"hr": "croatian",
81-
"bg": "bulgarian",
82-
"lt": "lithuanian",
83-
"la": "latin",
84-
"mi": "maori",
85-
"ml": "malayalam",
86-
"cy": "welsh",
87-
"sk": "slovak",
88-
"te": "telugu",
89-
"fa": "persian",
90-
"lv": "latvian",
91-
"bn": "bengali",
92-
"sr": "serbian",
93-
"az": "azerbaijani",
94-
"sl": "slovenian",
95-
"kn": "kannada",
96-
"et": "estonian",
97-
"mk": "macedonian",
98-
"br": "breton",
99-
"eu": "basque",
100-
"is": "icelandic",
101-
"hy": "armenian",
102-
"ne": "nepali",
103-
"mn": "mongolian",
104-
"bs": "bosnian",
105-
"kk": "kazakh",
106-
"sq": "albanian",
107-
"sw": "swahili",
108-
"gl": "galician",
109-
"mr": "marathi",
110-
"pa": "punjabi",
111-
"si": "sinhala",
112-
"km": "khmer",
113-
"sn": "shona",
114-
"yo": "yoruba",
115-
"so": "somali",
116-
"af": "afrikaans",
117-
"oc": "occitan",
118-
"ka": "georgian",
119-
"be": "belarusian",
120-
"tg": "tajik",
121-
"sd": "sindhi",
122-
"gu": "gujarati",
123-
"am": "amharic",
124-
"yi": "yiddish",
125-
"lo": "lao",
126-
"uz": "uzbek",
127-
"fo": "faroese",
128-
"ht": "haitian creole",
129-
"ps": "pashto",
130-
"tk": "turkmen",
131-
"nn": "nynorsk",
132-
"mt": "maltese",
133-
"sa": "sanskrit",
134-
"lb": "luxembourgish",
135-
"my": "myanmar",
136-
"bo": "tibetan",
137-
"tl": "tagalog",
138-
"mg": "malagasy",
139-
"as": "assamese",
140-
"tt": "tatar",
141-
"haw": "hawaiian",
142-
"ln": "lingala",
143-
"ha": "hausa",
144-
"ba": "bashkir",
145-
"jw": "javanese",
146-
"su": "sundanese",
147-
}
47+
#LANGUAGES = {
48+
# "en": "english",
49+
# "zh": "chinese",
50+
# "de": "german",
51+
# "es": "spanish",
52+
# "ru": "russian",
53+
# "ko": "korean",
54+
# "fr": "french",
55+
# "ja": "japanese",
56+
# "pt": "portuguese",
57+
# "tr": "turkish",
58+
# "pl": "polish",
59+
# "ca": "catalan",
60+
# "nl": "dutch",
61+
# "ar": "arabic",
62+
# "sv": "swedish",
63+
# "it": "italian",
64+
# "id": "indonesian",
65+
# "hi": "hindi",
66+
# "fi": "finnish",
67+
# "vi": "vietnamese",
68+
# "iw": "hebrew",
69+
# "uk": "ukrainian",
70+
# "el": "greek",
71+
# "ms": "malay",
72+
# "cs": "czech",
73+
# "ro": "romanian",
74+
# "da": "danish",
75+
# "hu": "hungarian",
76+
# "ta": "tamil",
77+
# "no": "norwegian",
78+
# "th": "thai",
79+
# "ur": "urdu",
80+
# "hr": "croatian",
81+
# "bg": "bulgarian",
82+
# "lt": "lithuanian",
83+
# "la": "latin",
84+
# "mi": "maori",
85+
# "ml": "malayalam",
86+
# "cy": "welsh",
87+
# "sk": "slovak",
88+
# "te": "telugu",
89+
# "fa": "persian",
90+
# "lv": "latvian",
91+
# "bn": "bengali",
92+
# "sr": "serbian",
93+
# "az": "azerbaijani",
94+
# "sl": "slovenian",
95+
# "kn": "kannada",
96+
# "et": "estonian",
97+
# "mk": "macedonian",
98+
# "br": "breton",
99+
# "eu": "basque",
100+
# "is": "icelandic",
101+
# "hy": "armenian",
102+
# "ne": "nepali",
103+
# "mn": "mongolian",
104+
# "bs": "bosnian",
105+
# "kk": "kazakh",
106+
# "sq": "albanian",
107+
# "sw": "swahili",
108+
# "gl": "galician",
109+
# "mr": "marathi",
110+
# "pa": "punjabi",
111+
# "si": "sinhala",
112+
# "km": "khmer",
113+
# "sn": "shona",
114+
# "yo": "yoruba",
115+
# "so": "somali",
116+
# "af": "afrikaans",
117+
# "oc": "occitan",
118+
# "ka": "georgian",
119+
# "be": "belarusian",
120+
# "tg": "tajik",
121+
# "sd": "sindhi",
122+
# "gu": "gujarati",
123+
# "am": "amharic",
124+
# "yi": "yiddish",
125+
# "lo": "lao",
126+
# "uz": "uzbek",
127+
# "fo": "faroese",
128+
# "ht": "haitian creole",
129+
# "ps": "pashto",
130+
# "tk": "turkmen",
131+
# "nn": "nynorsk",
132+
# "mt": "maltese",
133+
# "sa": "sanskrit",
134+
# "lb": "luxembourgish",
135+
# "my": "myanmar",
136+
# "bo": "tibetan",
137+
# "tl": "tagalog",
138+
# "mg": "malagasy",
139+
# "as": "assamese",
140+
# "tt": "tatar",
141+
# "haw": "hawaiian",
142+
# "ln": "lingala",
143+
# "ha": "hausa",
144+
# "ba": "bashkir",
145+
# "jw": "javanese",
146+
# "su": "sundanese",
147+
#}
148 148

149 149
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
150 150
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):

0 commit comments

Comments
 (0)