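"""Ingestion script: loads .txt sources from ./data, splits them into chunks,
and builds two persistent indexes for hybrid retrieval: a Chroma vector store
(OpenAI embeddings) and a pickled BM25 index tokenized with tiktoken.
"""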
import os
import pickle
import re
import glob
import tiktoken
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document
load_dotenv()
DATA_DIR = "./data"
DB_PATH = "./db/chroma"
BM25_PATH = "./db/bm25.pkl"
def ingest_data():
    if not os.path.exists("./db"):
        os.makedirs("./db")

    print(">>> [Step 1] Loading & Splitting Data...")
    file_paths = glob.glob(os.path.join(DATA_DIR, "*.txt"))
    if not file_paths:
        print("❌ No .txt files found.")
        return
    all_texts = []

    # Chunking settings: 600-character chunks with 100-character overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""]
    )
    for file_path in file_paths:
        file_name = os.path.basename(file_path).replace(".txt", "")
        print(f"  - Processing: {file_name}")
        loader = TextLoader(file_path, encoding="utf-8")
        documents = loader.load()
        texts = text_splitter.split_documents(documents)

        for doc in texts:
            doc.metadata["source"] = file_name
            # Extract the article number (the Korean "제 N 조" pattern)
            # so rule-based lookups can filter by article
            match = re.search(r"제\s*(\d+)\s*조", doc.page_content)
            doc.metadata["article_id"] = match.group(1) if match else "None"

        all_texts.extend(texts)
print(f"β
Total chunks: {len(all_texts)}")
    # 2. ChromaDB (Vector)
    print("\n>>> [Step 2] Creating Vector DB (Chroma)...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = Chroma.from_documents(
        documents=all_texts,
        embedding=embeddings,
        persist_directory=DB_PATH,
        collection_name="hanyang_rules"
    )
print("β
Vector DB created.")
    # 3. BM25 (Lexical) - tokenized with tiktoken
    print("\n>>> [Step 3] Creating BM25 Index (with Tiktoken)...")

    # Load the tokenizer for the gpt-5-mini model; if tiktoken does not
    # recognize the model name, fall back to the cl100k_base encoding
    try:
        tokenizer = tiktoken.encoding_for_model("gpt-5-mini")
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")

    def tiktoken_tokenizer(text):
        # BM25 expects a list of string tokens, so the integer token IDs
        # must be converted to strings
        tokens = tokenizer.encode(text)
        return [str(t) for t in tokens]
    tokenized_corpus = [tiktoken_tokenizer(doc.page_content) for doc in all_texts]
    bm25 = BM25Okapi(tokenized_corpus)

    bm25_data = {
        "bm25": bm25,
        "documents": all_texts
    }
    with open(BM25_PATH, "wb") as f:
        pickle.dump(bm25_data, f)
print("β
BM25 Index saved.")
print("\n Ingestion Complete!")
if __name__ == "__main__":
    ingest_data()
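# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of this script): a retriever
# would reload both persisted indexes roughly like this. The query string
# below is a hypothetical example.
#
#   embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
#   vector_store = Chroma(
#       persist_directory=DB_PATH,
#       collection_name="hanyang_rules",
#       embedding_function=embeddings,
#   )
#   with open(BM25_PATH, "rb") as f:
#       bm25_data = pickle.load(f)
#   bm25, documents = bm25_data["bm25"], bm25_data["documents"]
#
#   # Semantic candidates from Chroma:
#   hits = vector_store.similarity_search("제 5 조", k=4)
#
#   # Lexical scores from BM25, tokenized the same way as the corpus:
#   tokenizer = tiktoken.get_encoding("cl100k_base")
#   query_tokens = [str(t) for t in tokenizer.encode("제 5 조")]
#   scores = bm25.get_scores(query_tokens)
# ---------------------------------------------------------------------------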