Commit 8528ec5 (unverified)
Author: ggerganov
Parent(s): 6c811ac

models : add usage comments to the HF convert script (#157)

models/convert-h5-to-ggml.py CHANGED
@@ -1,3 +1,20 @@
+# Convert Hugging Face fine-tuned models to ggml format
+#
+# Usage:
+#
+#  git clone https://github.com/openai/whisper
+#  git clone https://github.com/ggerganov/whisper.cpp
+#  git clone https://huggingface.co/openai/whisper-medium
+#
+#  python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
+#
+# This script is similar to "convert-pt-to-ggml.py"
+#
+# For more info:
+#
+#   https://github.com/ggerganov/whisper.cpp/issues/157
+#
+
 import io
 import os
 import sys
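
The usage block above runs the converter with three arguments: the fine-tuned model directory, the cloned openai/whisper repo, and the output directory. As a quick pre-flight check before converting (a sketch, not part of this commit: WhisperForConditionalGeneration is the standard transformers class for these checkpoints and is assumed here, since the converter's own body is not shown in this diff):

    # Pre-flight sketch (assumes a transformers version with Whisper support).
    # Loads the cloned checkpoint to catch a broken or incomplete clone
    # before running the conversion script.
    from transformers import WhisperForConditionalGeneration

    model = WhisperForConditionalGeneration.from_pretrained("./whisper-medium")
    print(sum(p.numel() for p in model.parameters()), "parameters")
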
models/convert-pt-to-ggml.py CHANGED
@@ -44,107 +44,107 @@ import numpy as np
 #from transformers import GPT2TokenizerFast
 
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-LANGUAGES = {
-    "en": "english",
-    "zh": "chinese",
-    "de": "german",
-    "es": "spanish",
-    "ru": "russian",
-    "ko": "korean",
-    "fr": "french",
-    "ja": "japanese",
-    "pt": "portuguese",
-    "tr": "turkish",
-    "pl": "polish",
-    "ca": "catalan",
-    "nl": "dutch",
-    "ar": "arabic",
-    "sv": "swedish",
-    "it": "italian",
-    "id": "indonesian",
-    "hi": "hindi",
-    "fi": "finnish",
-    "vi": "vietnamese",
-    "iw": "hebrew",
-    "uk": "ukrainian",
-    "el": "greek",
-    "ms": "malay",
-    "cs": "czech",
-    "ro": "romanian",
-    "da": "danish",
-    "hu": "hungarian",
-    "ta": "tamil",
-    "no": "norwegian",
-    "th": "thai",
-    "ur": "urdu",
-    "hr": "croatian",
-    "bg": "bulgarian",
-    "lt": "lithuanian",
-    "la": "latin",
-    "mi": "maori",
-    "ml": "malayalam",
-    "cy": "welsh",
-    "sk": "slovak",
-    "te": "telugu",
-    "fa": "persian",
-    "lv": "latvian",
-    "bn": "bengali",
-    "sr": "serbian",
-    "az": "azerbaijani",
-    "sl": "slovenian",
-    "kn": "kannada",
-    "et": "estonian",
-    "mk": "macedonian",
-    "br": "breton",
-    "eu": "basque",
-    "is": "icelandic",
-    "hy": "armenian",
-    "ne": "nepali",
-    "mn": "mongolian",
-    "bs": "bosnian",
-    "kk": "kazakh",
-    "sq": "albanian",
-    "sw": "swahili",
-    "gl": "galician",
-    "mr": "marathi",
-    "pa": "punjabi",
-    "si": "sinhala",
-    "km": "khmer",
-    "sn": "shona",
-    "yo": "yoruba",
-    "so": "somali",
-    "af": "afrikaans",
-    "oc": "occitan",
-    "ka": "georgian",
-    "be": "belarusian",
-    "tg": "tajik",
-    "sd": "sindhi",
-    "gu": "gujarati",
-    "am": "amharic",
-    "yi": "yiddish",
-    "lo": "lao",
-    "uz": "uzbek",
-    "fo": "faroese",
-    "ht": "haitian creole",
-    "ps": "pashto",
-    "tk": "turkmen",
-    "nn": "nynorsk",
-    "mt": "maltese",
-    "sa": "sanskrit",
-    "lb": "luxembourgish",
-    "my": "myanmar",
-    "bo": "tibetan",
-    "tl": "tagalog",
-    "mg": "malagasy",
-    "as": "assamese",
-    "tt": "tatar",
-    "haw": "hawaiian",
-    "ln": "lingala",
-    "ha": "hausa",
-    "ba": "bashkir",
-    "jw": "javanese",
-    "su": "sundanese",
-}
+#LANGUAGES = {
+#    "en": "english",
+#    "zh": "chinese",
+#    "de": "german",
+#    "es": "spanish",
+#    "ru": "russian",
+#    "ko": "korean",
+#    "fr": "french",
+#    "ja": "japanese",
+#    "pt": "portuguese",
+#    "tr": "turkish",
+#    "pl": "polish",
+#    "ca": "catalan",
+#    "nl": "dutch",
+#    "ar": "arabic",
+#    "sv": "swedish",
+#    "it": "italian",
+#    "id": "indonesian",
+#    "hi": "hindi",
+#    "fi": "finnish",
+#    "vi": "vietnamese",
+#    "iw": "hebrew",
+#    "uk": "ukrainian",
+#    "el": "greek",
+#    "ms": "malay",
+#    "cs": "czech",
+#    "ro": "romanian",
+#    "da": "danish",
+#    "hu": "hungarian",
+#    "ta": "tamil",
+#    "no": "norwegian",
+#    "th": "thai",
+#    "ur": "urdu",
+#    "hr": "croatian",
+#    "bg": "bulgarian",
+#    "lt": "lithuanian",
+#    "la": "latin",
+#    "mi": "maori",
+#    "ml": "malayalam",
+#    "cy": "welsh",
+#    "sk": "slovak",
+#    "te": "telugu",
+#    "fa": "persian",
+#    "lv": "latvian",
+#    "bn": "bengali",
+#    "sr": "serbian",
+#    "az": "azerbaijani",
+#    "sl": "slovenian",
+#    "kn": "kannada",
+#    "et": "estonian",
+#    "mk": "macedonian",
+#    "br": "breton",
+#    "eu": "basque",
+#    "is": "icelandic",
+#    "hy": "armenian",
+#    "ne": "nepali",
+#    "mn": "mongolian",
+#    "bs": "bosnian",
+#    "kk": "kazakh",
+#    "sq": "albanian",
+#    "sw": "swahili",
+#    "gl": "galician",
+#    "mr": "marathi",
+#    "pa": "punjabi",
+#    "si": "sinhala",
+#    "km": "khmer",
+#    "sn": "shona",
+#    "yo": "yoruba",
+#    "so": "somali",
+#    "af": "afrikaans",
+#    "oc": "occitan",
+#    "ka": "georgian",
+#    "be": "belarusian",
+#    "tg": "tajik",
+#    "sd": "sindhi",
+#    "gu": "gujarati",
+#    "am": "amharic",
+#    "yi": "yiddish",
+#    "lo": "lao",
+#    "uz": "uzbek",
+#    "fo": "faroese",
+#    "ht": "haitian creole",
+#    "ps": "pashto",
+#    "tk": "turkmen",
+#    "nn": "nynorsk",
+#    "mt": "maltese",
+#    "sa": "sanskrit",
+#    "lb": "luxembourgish",
+#    "my": "myanmar",
+#    "bo": "tibetan",
+#    "tl": "tagalog",
+#    "mg": "malagasy",
+#    "as": "assamese",
+#    "tt": "tatar",
+#    "haw": "hawaiian",
+#    "ln": "lingala",
+#    "ha": "hausa",
+#    "ba": "bashkir",
+#    "jw": "javanese",
+#    "su": "sundanese",
+#}
 
 ## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
 #def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
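
Since the commented-out table duplicates whisper/tokenizer.py (the "ref" link above), a script that still needs it can import it from the cloned whisper checkout instead of keeping a local copy. A minimal sketch, assuming the ./whisper clone from the usage instructions is on sys.path and its dependencies (torch, etc.) are installed:

    # Import the language table from the cloned openai/whisper repo instead
    # of duplicating it (the "./whisper" path follows the usage instructions
    # above and is an assumption here).
    import sys
    sys.path.insert(0, "./whisper")

    from whisper.tokenizer import LANGUAGES

    print(LANGUAGES["en"])  # prints "english"
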