Ivan Gorin committed:

models : change convert-pt-to-ggml to use .tiktoken tokenizer files (#725)
models/convert-pt-to-ggml.py (+7, -11)
--- a/models/convert-pt-to-ggml.py
+++ b/models/convert-pt-to-ggml.py
@@ -39,6 +39,7 @@ import json
 import code
 import torch
 import numpy as np
+import base64

 #from transformers import GPTJForCausalLM
 #from transformers import GPT2TokenizerFast
@@ -224,18 +225,14 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
 #code.interact(local=locals())

 multilingual = hparams["n_vocab"] == 51865
-dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
-
-#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
-#print(tokenizer)
-#print(tokenizer.name_or_path)
-#print(len(tokenizer.additional_special_tokens))
+tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")

 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"

-with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
-    tokens = json.load(f)
+with open(tokenizer, "rb") as f:
+    contents = f.read()
+    tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}

 # use 16-bit or 32-bit floats
 use_f16 = True
@@ -271,9 +268,8 @@ byte_decoder = {v:k for k, v in byte_encoder.items()}
 fout.write(struct.pack("i", len(tokens)))

 for key in tokens:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
+    fout.write(struct.pack("i", len(key)))
+    fout.write(key)

 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
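For context, a .tiktoken vocabulary file is plain text with one token per line: the token's bytes as base64, a space, and the token's integer rank. (The `multilingual and "multilingual.tiktoken" or "gpt2.tiktoken"` expression is the old and/or spelling of `"multilingual.tiktoken" if multilingual else "gpt2.tiktoken"`.) Below is a minimal sketch of the parsing the new code performs; the two sample lines are illustrative stand-ins, not taken from the real asset files:

    import base64

    # Two lines in .tiktoken format: "<base64-encoded token> <rank>".
    # b"IQ==" decodes to b"!" and b"Ig==" decodes to b'"'.
    sample = b"IQ== 0\nIg== 1\n"

    # The same dict comprehension as in the diff: raw token bytes -> rank.
    tokens = {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in sample.splitlines() if line)
    }
    assert tokens == {b"!": 0, b'"': 1}

Because the file is opened in binary mode, `line.split()` yields bytes objects, which both `base64.b64decode` and `int` accept directly. The decoded keys are already raw bytes, which is why the last hunk can drop the old `byte_decoder` round-trip and write each key as-is.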
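The write loop serializes the vocabulary as length-prefixed records: a 4-byte token count, then for each token a 4-byte length followed by the raw token bytes. A length prefix is needed because token bytes can contain newlines or any other byte value, so no delimiter would be safe. Here is a minimal sketch of writing and reading back just this vocabulary section; the standalone file name vocab-demo.bin is hypothetical (the real script writes the vocabulary into ggml-model.bin along with the rest of the model data):

    import struct

    tokens = {b"!": 0, b'"': 1}  # tiny stand-in vocabulary

    # Write: token count, then (length, bytes) per token, as in the diff.
    with open("vocab-demo.bin", "wb") as fout:
        fout.write(struct.pack("i", len(tokens)))
        for key in tokens:
            fout.write(struct.pack("i", len(key)))
            fout.write(key)

    # Read back: the mirror of the format above.
    with open("vocab-demo.bin", "rb") as fin:
        (n_vocab,) = struct.unpack("i", fin.read(4))
        vocab = [fin.read(struct.unpack("i", fin.read(4))[0]) for _ in range(n_vocab)]

    assert vocab == [b"!", b'"']

Note that the ranks themselves are never written: a loader assigns token ids by read order, so this only round-trips correctly if the dict is iterated in rank order, which holds when the .tiktoken file lists tokens sorted by rank (Python dicts preserve insertion order).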