Spaces:
Sleeping
Sleeping
"""Embed every definition in the sonajaht database with LaBSE.

Reads the ``value`` column of the ``definitions`` table in ``sonajaht.db``,
encodes the rows batch by batch with the ``sentence-transformers/LaBSE``
model, and writes the concatenated embeddings to
``definitions.safetensors`` under the key ``vectors``.

Raises:
    ValueError: if the query returns no rows, since there is nothing to
        concatenate or save.
"""

import duckdb
import numpy as np
from safetensors.numpy import save_file
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

DB_PATH = "sonajaht.db"
MODEL_NAME = "sentence-transformers/LaBSE"
OUTPUT_PATH = "definitions.safetensors"
QUERY = "SELECT value FROM definitions"
BATCH_SIZE = 64
# Device string handed to ``model.encode``; "mps" is PyTorch's Apple Metal
# backend. Change it here when running on other hardware.
DEVICE = "mps"

model = SentenceTransformer(MODEL_NAME)

vectors = []
# Context managers guarantee the connection and the progress bar are closed
# even when fetching or encoding raises part-way through.
with duckdb.connect(DB_PATH) as conn, tqdm(unit="definition") as p_bar:
    result = conn.execute(QUERY)
    while True:
        chunk = result.fetchmany(BATCH_SIZE)
        if not chunk:
            break
        # Each fetched row is a 1-tuple holding the selected ``value`` column.
        values = [row[0] for row in chunk]
        vectors.append(
            model.encode(
                values,
                show_progress_bar=False,
                batch_size=BATCH_SIZE,
                device=DEVICE,
            )
        )
        # The final chunk may be shorter than BATCH_SIZE, so advance by the
        # number of rows actually processed rather than the nominal size.
        p_bar.update(len(chunk))

if not vectors:
    # np.concatenate([]) would fail with an opaque message; say what happened.
    raise ValueError(
        f"query {QUERY!r} returned no rows from {DB_PATH!r}; nothing to embed"
    )

vectors = np.concatenate(vectors)
save_file({"vectors": vectors}, OUTPUT_PATH)