import os
import pickle
import re
import glob

import tiktoken
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document

load_dotenv()

DATA_DIR = "./data"
DB_PATH = "./db/chroma"
BM25_PATH = "./db/bm25.pkl"


def ingest_data():
    if not os.path.exists("./db"):
        os.makedirs("./db")

    # 1. Load & split source documents
    print(">>> [Step 1] Loading & Splitting Data...")
    file_paths = glob.glob(os.path.join(DATA_DIR, "*.txt"))
    if not file_paths:
        print("❌ No .txt files found.")
        return

    all_texts = []

    # Chunk size configuration
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""],
    )

    for file_path in file_paths:
        file_name = os.path.basename(file_path).replace(".txt", "")
        print(f" - Processing: {file_name}")

        loader = TextLoader(file_path, encoding="utf-8")
        documents = loader.load()
        texts = text_splitter.split_documents(documents)

        for doc in texts:
            doc.metadata["source"] = file_name
            # Extract the article number (e.g. "제3조") for rule-based retrieval
            match = re.search(r"제\s*(\d+)\s*조", doc.page_content)
            doc.metadata["article_id"] = match.group(1) if match else "None"

        all_texts.extend(texts)

    print(f"✅ Total chunks: {len(all_texts)}")

    # 2. ChromaDB (Vector)
    print("\n>>> [Step 2] Creating Vector DB (Chroma)...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = Chroma.from_documents(
        documents=all_texts,
        embedding=embeddings,
        persist_directory=DB_PATH,
        collection_name="hanyang_rules",
    )
    print("✅ Vector DB created.")

    # 3. BM25 (Lexical) - using tiktoken
    print("\n>>> [Step 3] Creating BM25 Index (with Tiktoken)...")

    # Load the tokenizer used by the gpt-5-mini model; fall back to cl100k_base
    # if tiktoken does not recognize the model name.
    try:
        tokenizer = tiktoken.encoding_for_model("gpt-5-mini")
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")

    def tiktoken_tokenizer(text):
        # BM25Okapi expects a list of string tokens, so the integer token IDs
        # must be converted to strings.
        tokens = tokenizer.encode(text)
        return [str(t) for t in tokens]

    tokenized_corpus = [tiktoken_tokenizer(doc.page_content) for doc in all_texts]
    bm25 = BM25Okapi(tokenized_corpus)

    bm25_data = {
        "bm25": bm25,
        "documents": all_texts,
    }
    with open(BM25_PATH, "wb") as f:
        pickle.dump(bm25_data, f)
    print("✅ BM25 Index saved.")

    print("\n Ingestion Complete!")


if __name__ == "__main__":
    ingest_data()
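

# --- Usage sketch (assumption, not part of the ingestion flow) ---
# A minimal illustration of how the artifacts written above could be reloaded
# at query time. The function name `load_retrievers_sketch` is illustrative
# and is never called by this script.
def load_retrievers_sketch():
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    # Reopen the persisted Chroma collection created in Step 2.
    vector_store = Chroma(
        collection_name="hanyang_rules",
        embedding_function=embeddings,
        persist_directory=DB_PATH,
    )
    # Reload the pickled BM25 index and its parallel document list from Step 3.
    with open(BM25_PATH, "rb") as f:
        bm25_data = pickle.load(f)
    return vector_store, bm25_data["bm25"], bm25_data["documents"]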