import os
import sys
import json

import pandas as pd

# pylint: disable=wrong-import-position
# Make the repository root importable so graphgen resolves when this
# script is run from a subdirectory.
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)

from graphgen.models import Tokenizer


def count_tokens(file, tokenizer_name, data_frame):
    """Count tokens in `file` and return an updated stats DataFrame."""
    if not file or not os.path.exists(file):
        return data_frame

    if file.endswith(".jsonl"):
        # One JSON object per line
        with open(file, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
    elif file.endswith(".json"):
        # Expects a list of lists; flatten one level
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        data = [item for sublist in data for item in sublist]
    elif file.endswith(".txt"):
        # Plain text: split into fixed-size 512-character chunks
        with open(file, "r", encoding="utf-8") as f:
            data = f.read()
        chunks = [data[i : i + 512] for i in range(0, len(data), 512)]
        data = [{"content": chunk} for chunk in chunks]
    else:
        raise ValueError(f"Unsupported file type: {file}")

    tokenizer = Tokenizer(tokenizer_name)

    # Count tokens; dict items carry their text under "content"
    token_count = 0
    for item in data:
        if isinstance(item, dict):
            content = item.get("content", "")
        else:
            content = item
        token_count += len(tokenizer.encode_string(content))

    # Single row for the stats table: raw token count, token count x 50,
    # and a placeholder
    _update_data = [[str(token_count), str(token_count * 50), "N/A"]]

    try:
        new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
        data_frame = new_df
    except Exception as e:  # pylint: disable=broad-except
        print("[ERROR] DataFrame operation failed:", str(e))

    return data_frame
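

# Usage sketch: a minimal, hedged example of calling count_tokens directly.
# The tokenizer name "cl100k_base", the file name "example.jsonl", and the
# column labels below are illustrative assumptions, not part of this module;
# the real app supplies its own stats DataFrame. If the file does not exist,
# count_tokens simply returns the DataFrame unchanged.
if __name__ == "__main__":
    stats_df = pd.DataFrame(
        [["0", "0", "N/A"]],
        columns=["Token Count", "Estimated Usage", "Token Used"],  # assumed labels
    )
    print(count_tokens("example.jsonl", "cl100k_base", stats_df))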