import os
import pickle
import re
import glob

import tiktoken
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document

load_dotenv()

DATA_DIR = "./data"
DB_PATH = "./db/chroma"
BM25_PATH = "./db/bm25.pkl"

def ingest_data():
    if not os.path.exists("./db"):
        os.makedirs("./db")

    print(">>> [Step 1] Loading & Splitting Data...")
    file_paths = glob.glob(os.path.join(DATA_DIR, "*.txt"))
    if not file_paths:
        print("❌ No .txt files found.")
        return

    all_texts = []

    # Chunk size settings
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""]
    )
    for file_path in file_paths:
        file_name = os.path.basename(file_path).replace(".txt", "")
        print(f" - Processing: {file_name}")

        loader = TextLoader(file_path, encoding="utf-8")
        documents = loader.load()
        texts = text_splitter.split_documents(documents)

        for doc in texts:
            doc.metadata["source"] = file_name
            # Extract the article number ("제N조") for rule-based lookup
            match = re.search(r"제\s*(\d+)\s*조", doc.page_content)
            doc.metadata["article_id"] = match.group(1) if match else "None"

        all_texts.extend(texts)

    print(f"✅ Total chunks: {len(all_texts)}")
    # 2. ChromaDB (Vector)
    print("\n>>> [Step 2] Creating Vector DB (Chroma)...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = Chroma.from_documents(
        documents=all_texts,
        embedding=embeddings,
        persist_directory=DB_PATH,
        collection_name="hanyang_rules"
    )
    print("✅ Vector DB created.")
    # 3. BM25 (Lexical) - tokenized with tiktoken
    print("\n>>> [Step 3] Creating BM25 Index (with Tiktoken)...")

    # Load the tokenizer used by the gpt-5-mini model
    # (fall back to cl100k_base if tiktoken does not know the model name)
    try:
        tokenizer = tiktoken.encoding_for_model("gpt-5-mini")
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")

    def tiktoken_tokenizer(text):
        # BM25 expects string tokens, so convert the integer token IDs to strings
        tokens = tokenizer.encode(text)
        return [str(t) for t in tokens]
    tokenized_corpus = [tiktoken_tokenizer(doc.page_content) for doc in all_texts]
    bm25 = BM25Okapi(tokenized_corpus)

    bm25_data = {
        "bm25": bm25,
        "documents": all_texts
    }
    with open(BM25_PATH, "wb") as f:
        pickle.dump(bm25_data, f)
    print("✅ BM25 Index saved.")

    print("\nIngestion Complete!")

if __name__ == "__main__":
    ingest_data()