Ivan Gorin committed
Commit a7879d3 · unverified · 1 Parent(s): 5aaa017

models : change convert-pt-to-ggml to use .tiktoken tokenizer files (#725)

Files changed (1)
  1. models/convert-pt-to-ggml.py +7 -11
models/convert-pt-to-ggml.py CHANGED

@@ -39,6 +39,7 @@ import json
 import code
 import torch
 import numpy as np
+import base64
 
 #from transformers import GPTJForCausalLM
 #from transformers import GPT2TokenizerFast
@@ -224,18 +225,14 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
-
-#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
-#print(tokenizer)
-#print(tokenizer.name_or_path)
-#print(len(tokenizer.additional_special_tokens))
+tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
 
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"
 
-with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
-    tokens = json.load(f)
+with open(tokenizer, "rb") as f:
+    contents = f.read()
+    tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
 
 # use 16-bit or 32-bit floats
 use_f16 = True
@@ -271,9 +268,8 @@ byte_decoder = {v:k for k, v in byte_encoder.items()}
 fout.write(struct.pack("i", len(tokens)))
 
 for key in tokens:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
+    fout.write(struct.pack("i", len(key)))
+    fout.write(key)
 
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
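For reference, each line of a .tiktoken file is a base64-encoded token followed by its integer rank, separated by whitespace. The following standalone sketch (made-up sample data, not part of the commit) exercises the same dict comprehension the script now uses:

    import base64

    # Two fabricated vocabulary entries in .tiktoken format:
    # "<base64(token bytes)> <rank>" per line.
    sample = b"\n".join([
        base64.b64encode(b"hello") + b" 0",
        base64.b64encode(b" world") + b" 1",
    ])

    # Same parsing logic as the updated convert-pt-to-ggml.py.
    tokens = {base64.b64decode(token): int(rank)
              for token, rank in (line.split() for line in sample.splitlines() if line)}

    assert tokens == {b"hello": 0, b" world": 1}

Because the keys are already raw bytes after base64 decoding, the writer no longer needs the byte_decoder round-trip that the old vocab.json path required, which is why the output loop also shrank.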
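On the reading side, the vocabulary is framed as a token count followed by length-prefixed byte strings, with ranks implied by write order (the .tiktoken assets list tokens in rank order). The helper below is an illustrative sketch, not code from whisper.cpp; it assumes the stream is already positioned at the vocabulary section, after the hparams and mel filters the script writes first:

    import struct

    def read_vocab(f):
        # Token count: one native int32, mirroring struct.pack("i", ...).
        n_vocab, = struct.unpack("i", f.read(4))
        vocab = {}
        for rank in range(n_vocab):
            n, = struct.unpack("i", f.read(4))  # byte length of this token
            vocab[rank] = f.read(n)             # raw token bytes, no decoding
        return vocab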