| """ | |
| 最简单的tokenizer | |
| """ | |
| import json | |
| from tokenizers import Tokenizer | |
| tokenizer = Tokenizer.from_file("20B_tokenizer.json") | |
| print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True)) | |
| print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False)) | |
| vocab = tokenizer.get_vocab() | |

def to_unicode(text):
    # Render every character as its \uXXXX code point.
    return ''.join(r'\u{:04X}'.format(ord(ch)) for ch in text)
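
# Quick illustrative check (not part of the original script): each character is
# rendered as its \uXXXX code point, e.g. the GPT-2 byte-level space marker "Ġ"
# is U+0120.
assert to_unicode("Ġ") == r"\u0120"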

def is_UTF_8(s):
    # Treat each character's code point as one byte value and check whether the
    # sequence follows the UTF-8 lead/continuation-byte pattern.
    remain = 0  # number of continuation bytes still expected
    for ch in s:
        if remain == 0:
            if (ord(ch) & 0x80) == 0x00:    # 1-byte sequence (ASCII)
                remain = 0
            elif (ord(ch) & 0xE0) == 0xC0:  # lead byte of a 2-byte sequence
                remain = 1
            elif (ord(ch) & 0xF0) == 0xE0:  # lead byte of a 3-byte sequence
                remain = 2
            elif (ord(ch) & 0xF8) == 0xF0:  # lead byte of a 4-byte sequence
                remain = 3
            else:
                return False
        else:
            if not ((ord(ch) & 0xC0) == 0x80):  # must be a continuation byte
                return False
            remain = remain - 1
    if remain == 0:  # if remain is non-zero here, the last sequence is incomplete
        return True
    else:
        return False
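
# Illustrative checks (assumptions for this sketch, not in the original script):
# a pure-ASCII string passes, while a lone continuation byte (0x80-0xBF) fails.
assert is_UTF_8("hello")
assert not is_UTF_8(chr(0x80))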

def test_reverse():
    """
    Round-trip check: encoding a token's surface form should give back exactly
    its own id; tokens that fail are written to reverse.jsonl.
    """
    with open("reverse.jsonl", "w", encoding="utf-8") as f_out:
        for token_id in range(tokenizer.get_vocab_size(with_added_tokens=False)):
            token = tokenizer.id_to_token(token_id)
            print(token_id, is_UTF_8(token))
            if "Ġ" in token:  # skip tokens containing the byte-level space marker
                continue
            encoding = tokenizer.encode(token)
            if len(encoding.ids) > 1 or encoding.ids[0] != token_id:
                f_out.write(json.dumps({"id": token_id, "token": token, "encoding": encoding.ids,
                                        "is_utf8": is_UTF_8(token), "isalpha": token.isalpha()}) + "\n")
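
# Sketch of one reverse.jsonl line (field layout only; the values are
# placeholders, the actual content depends on the vocabulary):
# {"id": 0, "token": "...", "encoding": [0, 0], "is_utf8": true, "isalpha": false}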

def test_single_token():
    """
    Encode single characters (one character may map to several ids).
    """
    for word in "发大厦三分赛中国解决方法黑白侗鸩,。!?;ĠABC":
        encoding = tokenizer.encode(word)
        for token_id in encoding.ids:
            decode_str = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
            token = tokenizer.id_to_token(token_id)
            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token),
                  token.encode("utf-8"), bytes(token, "utf-8"), to_unicode(token))

def test_long_token():
    """
    Encode long tokens, such as the dash runs that show up in source code.
    """
    words = [
        "//----------------------------------------------------------------",  # appears in code files
        "--------------------------",
        "-------------------------",
        "-----------------------",
    ]
    for word in words:
        encoding = tokenizer.encode(word)
        for token_id in encoding.ids:
            decode_str = tokenizer.decode([token_id])
            token = tokenizer.id_to_token(token_id)
            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))

def test_encode():
    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里 一 个"
    encoding = tokenizer.encode(text)
    for token_id in encoding.ids:
        decode_str = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
        token = tokenizer.id_to_token(token_id)
        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))

test_reverse()
# test_single_token()
# test_long_token()
# test_encode()
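
# Running note (an assumption, not stated in the script): this expects the
# tokenizer file 20B_tokenizer.json (presumably the GPT-NeoX-20B tokenizer) in
# the working directory and the Hugging Face `tokenizers` package installed,
# e.g. via `pip install tokenizers`.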