import os
import pickle
import re
import glob
import tiktoken
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document

load_dotenv()

DATA_DIR = "./data"
DB_PATH = "./db/chroma"
BM25_PATH = "./db/bm25.pkl"

def ingest_data():
    if not os.path.exists("./db"):
        os.makedirs("./db")

    print(">>> [Step 1] Loading & Splitting Data...")
    file_paths = glob.glob(os.path.join(DATA_DIR, "*.txt"))
    if not file_paths:
        print("❌ No .txt files found.")
        return

    all_texts = []
    # Chunk size / overlap settings
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=100,
        separators=["\n\n", "\n", " ", ""]
    )

    for file_path in file_paths:
        file_name = os.path.basename(file_path).replace(".txt", "")
        print(f"   - Processing: {file_name}")
        
        loader = TextLoader(file_path, encoding="utf-8")
        documents = loader.load()
        texts = text_splitter.split_documents(documents)
        
        for doc in texts:
            doc.metadata["source"] = file_name
            # Extract the article number ("제N조") for rule-based lookup
            match = re.search(r"제\s*(\d+)\s*조", doc.page_content)
            doc.metadata["article_id"] = match.group(1) if match else "None"
        
        all_texts.extend(texts)

    print(f"βœ… Total chunks: {len(all_texts)}")

    # 2. ChromaDB (Vector)
    print("\n>>> [Step 2] Creating Vector DB (Chroma)...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = Chroma.from_documents(
        documents=all_texts,
        embedding=embeddings,
        persist_directory=DB_PATH,
        collection_name="hanyang_rules"
    )
    print("βœ… Vector DB created.")

    # 3. BM25 (Lexical) - using tiktoken
    print("\n>>> [Step 3] Creating BM25 Index (with Tiktoken)...")
    
    # Load the tokenizer used by the GPT-5-mini model; fall back to cl100k_base if tiktoken does not know it
    try:
        tokenizer = tiktoken.encoding_for_model("gpt-5-mini")
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")

    def tiktoken_tokenizer(text):
        # Convert the list of integer token IDs to strings so BM25Okapi can work with them
        tokens = tokenizer.encode(text)
        return [str(t) for t in tokens]

    tokenized_corpus = [tiktoken_tokenizer(doc.page_content) for doc in all_texts]
    bm25 = BM25Okapi(tokenized_corpus)

    bm25_data = {
        "bm25": bm25,
        "documents": all_texts
    }
    
    with open(BM25_PATH, "wb") as f:
        pickle.dump(bm25_data, f)
    
    print("βœ… BM25 Index saved.")
    print("\n Ingestion Complete!")

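# --- Hedged sketch (not called anywhere in this script) ----------------------
# Illustrates how the artifacts persisted above could be loaded back for a
# simple hybrid lookup. The function name and the `query`/`top_k` parameters
# are illustrative assumptions, not part of the ingestion pipeline; only the
# paths, model names and collection name come from the code above.
def example_hybrid_search(query: str, top_k: int = 3):
    # Dense side: reopen the persisted Chroma collection with the same embeddings.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = Chroma(
        persist_directory=DB_PATH,
        embedding_function=embeddings,
        collection_name="hanyang_rules",
    )
    vector_hits = vector_store.similarity_search(query, k=top_k)

    # Lexical side: the query must be tokenized exactly like the corpus was
    # (tiktoken IDs rendered as strings), otherwise BM25 scores are meaningless.
    with open(BM25_PATH, "rb") as f:
        bm25_data = pickle.load(f)
    try:
        tokenizer = tiktoken.encoding_for_model("gpt-5-mini")
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    query_tokens = [str(t) for t in tokenizer.encode(query)]
    bm25_hits = bm25_data["bm25"].get_top_n(query_tokens, bm25_data["documents"], n=top_k)

    return vector_hits, bm25_hits
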
if __name__ == "__main__":
    ingest_data()